# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.decomposition import PCA

## Reading data

In [2]:
train = pd.read_csv("train.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temparature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.5 KB


In [3]:
train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


## Clearing data from Null values

In [4]:
train.columns = train.columns.str.replace(' ', '')
train = train[train.columns].copy()
train = train.fillna(method='bfill', axis = 1)

  train = train.fillna(method='bfill', axis = 1)


## Preprocessing data

In [5]:
train = train.drop(columns=['maxtemp', 'mintemp', 'day'])
train_data_X = train.copy()
train_data_y = train_data_X.pop('rainfall')


In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_data_X, train_data_y,test_size=0.25, random_state=50)

In [7]:
min_max_sc = MinMaxScaler()
X_train_sc = min_max_sc.fit_transform(X_train)
X_test_sc = min_max_sc.transform(X_test)

In [8]:
X_test_sc

array([[0.20922796, 0.39325843, 0.76763485, ..., 0.01652893, 0.10344828,
        0.18330309],
       [0.33257195, 0.58146067, 0.36099585, ..., 0.        , 0.24137931,
        0.61524501],
       [0.41434445, 0.26404494, 0.94190871, ..., 0.63636364, 0.65517241,
        0.21960073],
       ...,
       [0.24577433, 0.25561798, 0.9626556 , ..., 0.16528926, 0.62068966,
        0.15245009],
       [0.82457743, 0.5       , 0.61410788, ..., 0.37190083, 0.24137931,
        0.38294011],
       [0.80356327, 0.55898876, 0.67634855, ..., 0.23966942, 0.13793103,
        0.3738657 ]], shape=(548, 9))

In [9]:
rca = PCA()
X_train_sc = rca.fit_transform(X_train_sc)
X_test_sc = rca.transform(X_test_sc)

In [10]:
test = pd.read_csv("test.csv")
test.columns = test.columns.str.replace(' ', '')
test = test[test.columns].copy()
test = test.fillna(method='bfill', axis = 1)
test = test.drop(columns=['maxtemp', 'mintemp', 'day'])
test = min_max_sc.transform(test)
test = rca.transform(test)

  test = test.fillna(method='bfill', axis = 1)


### Function for serching best model and hyperparameters

In [11]:
def training_model(model, params, train_data_X, train_data_y,test_data_X, test_data_y):
    grid_search = GridSearchCV(model, params, cv=5, refit = True)
    grid_search.fit(train_data_X, train_data_y)
    best_est_model = grid_search.best_estimator_
    best_est_model.fit(X_train_sc, y_train)
    predict_test = best_est_model.predict(test_data_X)
    acc = accuracy_score(test_data_y, predict_test)
    return  best_est_model, acc
    
    

In [12]:
models_list = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), GradientBoostingClassifier()]
log_reg_params = {
    "penalty":["l1", "l2", None],
    "solver":  ["saga"],
    "fit_intercept": [True, False],
    'max_iter': [100, 1000, 2500, 5000],
    
}
rand_for_params = {
    'n_estimators': [50, 100, 150, 200],
    'criterion':['gini', 'entropy', 'log_loss'],
    'max_depth': [None,5,10,15,20],
    'min_samples_leaf': [1, 2, 4],
    "max_leaf_nodes": [None, 2, 3, 4, 5]
}

knn_params = {
    'n_neighbors' : [5,7,9,11,13,15],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
gb_params = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.001, 0.1, 1, 10],
    'n_estimators': [100, 150, 180, 200],
    'criterion' : ['friedman_mse', 'squared_error'],
    'max_depth': [None,5,10,15,20],
    'min_samples_leaf': [1, 2, 4]
}
params_list = [log_reg_params,knn_params, rand_for_params, gb_params]

### Learning

In [13]:
models_results = {}
for model,params in zip(models_list,params_list):
    est, acc = training_model(model,params,X_train_sc,y_train,X_test_sc, y_test)
    models_results[est.__class__.__name__] = (est,acc)

models_results
        

{'LogisticRegression': (LogisticRegression(penalty='l1', solver='saga'),
  0.8740875912408759),
 'KNeighborsClassifier': (KNeighborsClassifier(n_neighbors=13),
  0.864963503649635),
 'RandomForestClassifier': (RandomForestClassifier(max_depth=5, min_samples_leaf=4, n_estimators=50),
  0.8631386861313869),
 'GradientBoostingClassifier': (GradientBoostingClassifier(loss='exponential', max_depth=5),
  0.8631386861313869)}

In [14]:
full_data_mmsc = min_max_sc.transform(train_data_X)
full_data_mmsc_rc = rca.transform(full_data_mmsc)

In [16]:
best_model = models_results['LogisticRegression'][0]
best_model.fit(full_data_mmsc_rc,train_data_y)

In [17]:
preds = best_model.predict_proba(test)[:, 1]

In [18]:
sub = pd.read_csv('sample_submission.csv')
sub_pred = sub.copy()
sub_pred["rainfall"] = preds
sub_pred["rainfall"] = sub_pred["rainfall"].astype('float')

In [19]:
sub_pred.to_csv('submission_3.csv', index=False)