# Lab 03. Weather Predict Competition

This lab consists of only one task: train a model to predict if it is going to rain the next day:
- Classification metric is **roc-auc**
- Score should be calculated on a hold-out set (don't use it for training)


#### Evaluation
- Score > 0.860 - 3 points
- Score > 0.865 - 5 points
- Score > 0.870 - 7 points
- Score > 0.875 - 9 points
- Score > 0.880 - 11 points
- Score > 0.885 - 13 points
- Score > 0.890 - 15 points
- You are encouraged to use various methods and tricks to get extra points

#### Do not use any external data or models that were not covered in the course (no gradient boosting and  neural nets).


#### How to submit
- Name your file according to this convention: `2022_lab03_GroupNumber_Surname_Name.ipynb`, for example 
    - `2022_lab03_404_Sheipak_Sviat.ipynb`
    - `2022_lab03_NoGroup_Sheipak_Sviat.ipynb`
- Attach your .ipynb to an email with topic `2022_lab02_GroupNumber_Surname_Name`
- Send it to `cosmic.research.ml@yandex.ru`
- Deadline is ` 22022-11-03 23:00:00 +03:00`

#### The Data:
- All the datasets you need are here: https://github.com/cosmic-research-ml-edu/intro_ml_2022/blob/main/homeworks/hw03/train.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
db_train = pd.read_csv("holdout_data/train.csv")
db_test = pd.read_csv("holdout_data/holdout.csv")

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [4]:
target_col = "raintomorrow"
id_cols = ["row_id"]

In [5]:
Y_train = db_train[target_col].values
db_train.drop(columns=[target_col], inplace=True)

Y_test = db_test[target_col].values
db_test.drop(columns=[target_col], inplace=True)

In [6]:
numeric_cols = [x for x in db_train.select_dtypes(include=np.number).columns.tolist() if x not in id_cols]

In [7]:
numeric_cols

['mintemp',
 'maxtemp',
 'rainfall',
 'evaporation',
 'sunshine',
 'windgustspeed',
 'windspeed9am',
 'windspeed3pm',
 'humidity9am',
 'humidity3pm',
 'pressure9am',
 'pressure3pm',
 'cloud9am',
 'cloud3pm',
 'temp9am',
 'temp3pm']

In [8]:
db_train.head()

Unnamed: 0,row_id,mintemp,maxtemp,rainfall,evaporation,sunshine,windgustdir,windgustspeed,winddir9am,winddir3pm,...,windspeed3pm,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,raintoday
0,60284bd679027a1a,13.4,22.9,0.6,,,W,44.0,W,WNW,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No
1,5af49400eeec47eb,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No
2,367cbc10c57ff320,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No
3,f9841359d0350bc8,14.6,29.7,0.2,,,WNW,56.0,W,W,...,24.0,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No
4,5f4d2848f7ee131a,7.7,26.7,0.0,,,W,35.0,SSE,W,...,17.0,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No


In [131]:
db_train2 = db_train.copy()

In [132]:
wind_dir = ['E', 'ENE', 'NE', 'NNE', 'N', 'NNW', 'NW', 'WNW', 'W', 'WSW', 'SW', 'SSW', 'S', 'SSE', 'SE', 'ESE']


def find_dir(arr, dir1, dir2):
    ans = 0
    try:
        start = arr.index(dir1)
    except ValueError:
        ans = None
        return ans
    if dir2 not in arr:
        ans = None
        return ans
    for i in range(1, len(arr)):
        # print(arr[start-i], arr[start+i])
        left_i = start-i
        right_i = start+i
        if right_i < len(arr):
            if arr[right_i]==dir2:
                ans = i
                break
        if left_i >= -len(arr):
            if arr[left_i]==dir2:
                ans = i
                break
    return ans

In [133]:
# db_train['winddirdif'] = db_train['winddir3pm'].astype('float64') - db_train['winddir9am'].astype('float64')
db_train2['windspeeddif'] = db_train2['windspeed9am'].astype('float64') - db_train2['windspeed3pm'].astype('float64')
# db_train2['humiditydif'] = db_train2['humidity9am'].astype('float64') - db_train2['humidity3pm'].astype('float64')
db_train2['pressuredif'] = db_train2['pressure9am'].astype('float64') - db_train2['pressure3pm'].astype('float64')
# db_train2['tempdif'] = db_train2['temp9am'].astype('float64') - db_train2['temp3pm'].astype('float64')
# db_train2['maxmintempdif'] = db_train2['maxtemp'].astype('float64') - db_train2['mintemp'].astype('float64')
db_train2['winddirdif'] = [find_dir(wind_dir, x, y) for x,y in zip(db_train2['winddir9am'], db_train2['winddir3pm'])]
db_train2['windgustdir'] = [wind_dir.index(x) if pd.notna(x) else None for x in db_train2['windgustdir']]

In [134]:
db_train2.head()

Unnamed: 0,row_id,mintemp,maxtemp,rainfall,evaporation,sunshine,windgustdir,windgustspeed,winddir9am,winddir3pm,...,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,raintoday,windspeeddif,pressuredif,winddirdif
0,60284bd679027a1a,13.4,22.9,0.6,,,8.0,44.0,W,WNW,...,1007.7,1007.1,8.0,,16.9,21.8,No,-4.0,0.6,1.0
1,5af49400eeec47eb,7.4,25.1,0.0,,,7.0,44.0,NNW,WSW,...,1010.6,1007.8,,,17.2,24.3,No,-18.0,2.8,4.0
2,367cbc10c57ff320,17.5,32.3,1.0,,,8.0,41.0,ENE,NW,...,1010.8,1006.0,7.0,8.0,17.8,29.7,No,-13.0,4.8,5.0
3,f9841359d0350bc8,14.6,29.7,0.2,,,7.0,56.0,W,W,...,1009.2,1005.4,,,20.6,28.9,No,-5.0,3.8,0.0
4,5f4d2848f7ee131a,7.7,26.7,0.0,,,8.0,35.0,SSE,W,...,1013.4,1010.1,,,16.3,25.5,No,-11.0,3.3,5.0


In [137]:
#drop_list = ['windgustdir']
#db_train2.drop(columns=drop_list, inplace=True)
db_train2['raintoday'] = [1 if x == 'Yes' else 0 for x in db_train2['raintoday']]
db_train2.head()

Unnamed: 0,row_id,mintemp,maxtemp,rainfall,evaporation,sunshine,windgustdir,windgustspeed,winddir9am,winddir3pm,...,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,raintoday,windspeeddif,pressuredif,winddirdif
0,60284bd679027a1a,13.4,22.9,0.6,,,8.0,44.0,W,WNW,...,1007.7,1007.1,8.0,,16.9,21.8,0,-4.0,0.6,1.0
1,5af49400eeec47eb,7.4,25.1,0.0,,,7.0,44.0,NNW,WSW,...,1010.6,1007.8,,,17.2,24.3,0,-18.0,2.8,4.0
2,367cbc10c57ff320,17.5,32.3,1.0,,,8.0,41.0,ENE,NW,...,1010.8,1006.0,7.0,8.0,17.8,29.7,0,-13.0,4.8,5.0
3,f9841359d0350bc8,14.6,29.7,0.2,,,7.0,56.0,W,W,...,1009.2,1005.4,,,20.6,28.9,0,-5.0,3.8,0.0
4,5f4d2848f7ee131a,7.7,26.7,0.0,,,8.0,35.0,SSE,W,...,1013.4,1010.1,,,16.3,25.5,0,-11.0,3.3,5.0


In [138]:
numeric_cols2 = [x for x in db_train2.select_dtypes(include=np.number).columns.tolist() if x not in id_cols]
numeric_cols2

['mintemp',
 'maxtemp',
 'rainfall',
 'evaporation',
 'sunshine',
 'windgustdir',
 'windgustspeed',
 'windspeed9am',
 'windspeed3pm',
 'humidity9am',
 'humidity3pm',
 'pressure9am',
 'pressure3pm',
 'cloud9am',
 'cloud3pm',
 'temp9am',
 'temp3pm',
 'raintoday',
 'windspeeddif',
 'pressuredif',
 'winddirdif']

In [139]:
db_test2 = db_test.copy()

In [140]:
db_test2['windspeeddif'] = db_test2['windspeed9am'].astype('float64') - db_test2['windspeed3pm'].astype('float64')
# db_test2['humiditydif'] = db_test2['humidity9am'].astype('float64') - db_test2['humidity3pm'].astype('float64')
db_test2['pressuredif'] = db_test2['pressure9am'].astype('float64') - db_test2['pressure3pm'].astype('float64')
# db_test2['tempdif'] = db_test2['temp9am'].astype('float64') - db_test2['temp3pm'].astype('float64')
# db_test2['maxmintempdif'] = db_test2['maxtemp'].astype('float64') - db_test2['mintemp'].astype('float64')
db_test2['winddirdif'] = [find_dir(wind_dir, x, y) for x,y in zip(db_test2['winddir9am'], db_test2['winddir3pm'])]
db_test2['windgustdir'] = [wind_dir.index(x) if pd.notna(x) else None for x in db_test2['windgustdir']]
# db_test2.drop(columns=drop_list, inplace=True)
db_test2['raintoday'] = [1 if x == 'Yes' else 0 for x in db_test2['raintoday']]
db_test2.head()

Unnamed: 0,row_id,mintemp,maxtemp,rainfall,evaporation,sunshine,windgustdir,windgustspeed,winddir9am,winddir3pm,...,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,raintoday,windspeeddif,pressuredif,winddirdif
0,fb4157edd320e344,10.7,12.4,0.4,,,10.0,17.0,,WNW,...,1028.2,1025.8,,,11.1,11.6,0,-6.0,2.4,
1,872bf995fecb912e,5.6,18.1,2.6,,,9.0,31.0,NW,SW,...,1027.0,1024.9,,,10.8,17.3,1,-15.0,2.1,4.0
2,568ae84628a4386e,5.1,17.5,0.0,,,1.0,39.0,ENE,E,...,1026.9,1022.8,,,10.0,16.8,0,0.0,4.1,1.0
3,5511c87e12132b3b,10.0,27.8,0.0,,,6.0,76.0,N,W,...,1010.1,1003.3,,,16.6,21.8,0,-25.0,6.8,4.0
4,f12cb40d27a45c13,6.4,15.8,0.0,,,8.0,54.0,WSW,W,...,1011.0,1010.1,,,11.3,14.7,0,-13.0,0.9,1.0


In [141]:
X_train = db_train2[numeric_cols2].fillna(-9999).values
X_test = db_test2[numeric_cols2].fillna(-9999).values

In [144]:
clf = RandomForestClassifier(n_estimators=1000, criterion='entropy', random_state=42)

clf.fit(X_train, Y_train)
preds = clf.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test, preds)
# 0.8590878798154512

# 0.861863336258164 with a lot of features                                          GOOD
# 0.862092605696118 + without maxmintempdif                                         GOOD
# 0.8629833524953056 + without tempdif                                              GOOD
# 0.8600982535791015 + without humiditydif, pressuredif                             BAD
# 0.8623302739666973 + with humiditydif, pressuredif but without windspeeddif       GOOD

#=========so that means that windspeeddif is important=========

# 0.8591545934897975 + with windspeeddif, humiditydif but without pressuredif       BAD
# 0.8638280056244203 + with windspeeddif, pressuredif but without humiditydif       GOOD
# 0.8629439688672613 + previous with tempdif                                        BAD
# 0.8635903051250129 + inverted timestamps                                          GOOD
# 0.8647268934256078 + previous without tempdif                                     GOOD
# 0.8647007042798234 + previous without windspeeddif                                BAD

# 0.8679150593505474 + criterion='entropy' or criterion='log_loss'                  GOOD
# 0.8675162018186754 + previous with humiditydif with inverted timestamps           BAD

#=====pressuredif good; tempdif, humiditydif bad; criterion is not gini======
# gotta try doing something with winddir
# 0.8681020896862799 + with winddirdif and n_estimators=500                         GOOD
# 0.8686126910112406 + with winddirdif and n_estimators=1000                        GOOD
# 0.8692190828593375 + previous with windgustdir and n_estimators=1000              GOOD

0.8692190828593375

In [18]:
# -------------------------------------THIS CODE BLOCK IS IRRELEVANT-----------------------------------------------------

# Best estimator RandomForestClassifier(n_estimators=300, random_state=42) with score 0.83795; roc_auc_score 0.8595294792192641
# Best estimator RandomForestClassifier(n_estimators=500, random_state=42) with score 0.83875; roc_auc_score 0.8601734950013861
# Best estimator RandomForestClassifier(n_estimators=700, random_state=42) with score 0.83909; roc_auc_score 0.8602839303040504
clf = RandomForestClassifier(n_estimators=500, random_state=42)

clf.fit(X_train, Y_train)
preds = clf.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test, preds)

0.8601734950013861