In [1]:
import pandas as pd
import os
import re
from functools import reduce
import numpy as np
import time

import geopandas as gpd
from pyproj import Proj
import geojson

In [2]:
data_path = 'data/'
train_path = data_path + 'train/'
test_2019_path = data_path + 'test/test 2019/'
test_2020_path = data_path + 'test/test 2020/'
years = ['2015', '2016', '2017', '2018', '2019', '2020']

In [3]:
def get_merged_df_from_csv_files(path):

  files_names = os.listdir(path)

  df_list = []
  for i in range(len(files_names)):
    df = pd.read_csv(path+files_names[i])
    columns = df.columns
    df = df.rename(columns={
        columns[0]:columns[0]+'_'+years[i],
        columns[1]:columns[1]+'_'+years[i]
    })
    df_list.append(df)

  merged_df = reduce(lambda left, right: pd.merge(
      left, right, on=['centroid'], how='outer'), df_list)

  return merged_df

In [4]:
train_df = get_merged_df_from_csv_files(train_path)

In [5]:
train_df = train_df.reindex(columns=[
    'centroid', 'CODE_CULTU_2015', 'CODE_GROUP_2015', 'CODE_CULTU_2016',
    'CODE_GROUP_2016', 'CODE_CULTU_2017', 'CODE_GROUP_2017',
    'CODE_CULTU_2018', 'CODE_GROUP_2018', 'CODE_CULTU_2019',
    'CODE_GROUP_2019'])

In [6]:
test_2019_df = get_merged_df_from_csv_files(test_2019_path)

In [7]:
test_2019_df = test_2019_df.reindex(columns=[
    'centroid', 'CODE_CULTU_2015', 'CODE_GROUP_2015', 'CODE_CULTU_2016',
    'CODE_GROUP_2016', 'CODE_CULTU_2017', 'CODE_GROUP_2017',
    'CODE_CULTU_2018', 'CODE_GROUP_2018'])

In [8]:
train_part = train_df.iloc[:3600]

In [9]:
train_part.tail()

Unnamed: 0,centroid,CODE_CULTU_2015,CODE_GROUP_2015,CODE_CULTU_2016,CODE_GROUP_2016,CODE_CULTU_2017,CODE_GROUP_2017,CODE_CULTU_2018,CODE_GROUP_2018,CODE_CULTU_2019,CODE_GROUP_2019
3595,Point (847960.90537215 6582819.57682417),NVT,25,MIS,2,NVT,25,SNE,28,SNE,28
3596,Point (848694.18482885 6583204.68504862),CMB,25,PTC,25,ORH,3,SNE,28,SNE,28
3597,Point (848661.59276403 6583147.24438944),PTC,25,MIS,2,ORH,3,SNE,28,SNE,28
3598,Point (854694.36657987 6569838.61993557),BTH,1,ORH,3,BTH,1,MIE,2,ORH,3
3599,Point (854680.93860896 6571477.30489434),MIS,2,MIE,2,MIS,2,MIE,2,BTH,1


In [10]:
reg_data = gpd.read_file('fr_geojson/fr_reg.geojson')

In [11]:
reg_data.tail()

Unnamed: 0,code,nom,geometry
13,6,Mayotte,"MULTIPOLYGON (((45.04101 -12.64686, 45.03392 -..."
14,76,Occitanie,"MULTIPOLYGON (((1.78613 42.57362, 1.78094 42.5..."
15,84,Auvergne-Rhône-Alpes,"POLYGON ((3.36135 44.97141, 3.35453 44.95482, ..."
16,93,Provence-Alpes-Côte d'Azur,"MULTIPOLYGON (((6.94833 44.65482, 6.95451 44.6..."
17,94,Corse,"MULTIPOLYGON (((9.27103 41.36496, 9.26570 41.3..."


In [12]:
def get_roration_plan(seq):
    plan = ''
    usedplanValues = {}
    planValue = 'a'
    for el in seq:
        if el in usedplanValues:
            plan += usedplanValues[el]
        else:
            plan += planValue
            usedplanValues[el] = planValue
            planValue = chr(ord(planValue) + 1)
    return plan

def provide_culture_and_culture_group_encodings(df):
    df['roration_plan'] = df.apply(lambda row: get_roration_plan([row[column] for column in df.columns if column.startswith('plan_GROUP_')]), axis=1)

In [13]:
provide_culture_and_culture_group_encodings(train_part)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cultuCode'] = df.apply(lambda row: seqToCode([row[column] for column in df.columns if column.startswith('CODE_CULTU_')]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groupCode'] = df.apply(lambda row: seqToCode([row[column] for column in df.columns if column.startswith('CODE_GROUP_')]), axis=1)


In [14]:
groupCode_popularity = train_part.groupby(by='groupCode')['groupCode'].count().sort_values(ascending=False)

In [15]:
cultuCode_popularity = train_part.groupby(by='cultuCode')['cultuCode'].count().sort_values(ascending=False)

In [16]:
groupCode_popularity

groupCode
aaaaa    2062
abbbb     125
ababa      79
abccc      72
aaabb      70
aaaab      56
abaca      50
abcab      49
aabbb      47
aaabc      44
aabaa      44
abaaa      43
abcdd      43
abcba      42
abcbd      40
aaaba      38
abcdb      32
ababc      30
abcad      30
aabcd      29
abcaa      28
abaac      28
abbcd      27
abbab      27
abacc      26
aabcb      24
aabab      23
abbbc      23
abaab      22
abcac      21
aabca      20
abacd      20
abccb      18
abbcc      18
abcde      17
abbaa      17
abcdc      16
abcda      16
abbcb      16
aabcc      16
ababb      16
abcbc      15
abacb      15
abbac      15
abcbb      14
abbca      13
abcca      12
aabac      12
abccd      11
aabbc      11
abbba      10
aabba       8
Name: groupCode, dtype: int64

In [17]:
cultuCode_popularity

cultuCode
aaaaa    1877
abbbb     188
aabbb      86
abccc      78
ababa      62
aaabb      57
abaaa      56
abcde      56
abcbd      53
abcdd      52
aaaab      51
abaca      51
abbcd      46
aaabc      46
abcab      46
abcdb      38
abcba      38
aabcd      37
aabaa      36
abcad      35
ababc      31
aaaba      30
abacd      30
abcdc      30
abccd      26
abaac      25
abacc      25
abbbc      24
abcaa      23
abbab      22
aabcb      22
abccb      21
aabca      20
abcac      20
aabab      20
abbcb      19
abcda      19
aabcc      19
abaab      19
abbcc      17
abcbc      16
abacb      16
abbac      15
aabac      13
abcca      13
abcbb      13
abbaa      12
aabbc      12
abbca      11
ababb      11
abbba      10
aabba       7
Name: cultuCode, dtype: int64

In [18]:
train_part.describe(include='object')

Unnamed: 0,centroid,CODE_CULTU_2015,CODE_CULTU_2016,CODE_CULTU_2017,CODE_CULTU_2018,CODE_CULTU_2019,cultuCode,groupCode
count,3600,3600,3600,3600,3600,3600,3600,3600
unique,3600,46,55,57,61,53,52,52
top,Point (907753.82834152 6554634.46488151),PPH,PPH,PPH,PPH,PPH,aaaaa,aaaaa
freq,1,1589,1677,1716,1737,1735,1877,2062


In [19]:
train_part['crs_centroid'] = gpd.GeoSeries.from_wkt(train_part['centroid'], crs='epsg:2154').to_crs('epsg:4326')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_part['crs_centroid'] = gpd.GeoSeries.from_wkt(train_part['centroid'], crs='epsg:2154').to_crs('epsg:4326')


In [20]:
def get_region_name(point):

  try:
    region = reg_data.nom[reg_data.geometry.intersects(point)].values[0]
    return region
  except:
    return 'unknown'

In [25]:
train_part['region'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_part['region'] = 0


In [26]:
# !nohup python

start_time = time.time()

for i in range(0, train_part.shape[0]):
  train_part['region'][i] = get_region_name(train_part['crs_centroid'][i])

# train_part[['region']].to_csv('/content/drive/MyDrive/SkillFactory/CropRotationAnalysis/data/region_part1', index=False)

end_time = time.time()
delta = end_time - start_time
print(delta)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_part['region'][i] = get_region_name(train_part['crs_centroid'][i])


10.345516204833984


In [29]:
start_time = time.time()


train_part['region'] = train_part['crs_centroid'].apply(get_region_name)

# train_part[['region']].to_csv('/content/drive/MyDrive/SkillFactory/CropRotationAnalysis/data/region_part1', index=False)

end_time = time.time()
delta = end_time - start_time
print(delta)


9.507605791091919


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_part['region'] = train_part['crs_centroid'].apply(get_region_name)
