In [27]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
import seaborn as sb
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
from sklearn.svm import SVC
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
import joblib # for saving algorithm and preprocessing objects

In [28]:
df = pd.read_csv("Rainfall.csv")

In [29]:
print(df)

     day  pressure   maxtemp  temparature  mintemp  dewpoint  humidity   \
0      1     1025.9     19.9         18.3     16.8      13.1         72   
1      2     1022.0     21.7         18.9     17.2      15.6         81   
2      3     1019.7     20.3         19.3     18.0      18.4         95   
3      4     1018.9     22.3         20.6     19.1      18.8         90   
4      5     1015.9     21.3         20.7     20.2      19.9         95   
..   ...        ...      ...          ...      ...       ...        ...   
361   27     1022.7     18.8         17.7     16.9      15.0         84   
362   28     1026.6     18.6         17.3     16.3      12.8         75   
363   29     1025.9     18.9         17.7     16.4      13.3         75   
364   30     1025.3     19.2         17.3     15.2      13.3         78   
365   31     1026.4     20.5         17.8     15.5      13.0         74   

     cloud  rainfall  sunshine           winddirection  windspeed  
0        49      yes       9.3 

In [30]:
df.isnull().sum() #shows the number of null values per column

day                       0
pressure                  0
maxtemp                   0
temparature               0
mintemp                   0
dewpoint                  0
humidity                  0
cloud                     0
rainfall                  0
sunshine                  0
         winddirection    1
windspeed                 1
dtype: int64

In [31]:
df.columns

Index(['day', 'pressure ', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity ', 'cloud ', 'rainfall', 'sunshine', '         winddirection',
       'windspeed'],
      dtype='object')

In [32]:
df.rename(str.strip, axis='columns', inplace=True) #removes extra spaces in columns

In [33]:
df.columns

Index(['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity', 'cloud', 'rainfall', 'sunshine', 'winddirection',
       'windspeed'],
      dtype='object')

In [34]:
for col in df.columns:
  
  # Checking if the column contains nulls
  if df[col].isnull().sum() > 0:
    # Sets null values equal to the mean of that column
    val = df[col].mean()
    df[col] = df[col].fillna(val)
# Checks to see if there are any nulls left
df.isnull().sum().sum()

np.int64(0)

In [35]:
features = df.drop(['day', 'rainfall'], axis=1)
target = df.rainfall

In [36]:
X_train, X_val, Y_train, Y_val = train_test_split(features,target,test_size=0.2, stratify=target,random_state=2)

# balance it by adding repetitive rows of minority class.
ros = RandomOverSampler(sampling_strategy='minority',
                        random_state=22)
X, Y = ros.fit_resample(X_train, Y_train)

In [37]:
# normalizes the features
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_val = scaler.transform(X_val)

In [38]:
model = SVC(kernel='rbf', probability=True)

In [39]:
model.fit(X, Y)

print(f'{model} : ')

train_preds = model.predict_proba(X) 
print('Training Accuracy : ', metrics.roc_auc_score(Y, train_preds[:,1]))

val_preds = model.predict_proba(X_val) 
print('Validation Accuracy : ', metrics.roc_auc_score(Y_val, val_preds[:,1]))
print()

SVC(probability=True) : 
Training Accuracy :  0.9028054847099821
Validation Accuracy :  0.8958333333333333



In [42]:
# save svc algorithm
joblib.dump(model, "./svc.joblib", compress=True)

['./svc.joblib']