In [29]:
import numpy as np #import numpy
import pandas as pd #import pandas
import seaborn as sns # import seaborn
import matplotlib.pyplot as plt #import pyplot
from scipy.stats import pearsonr #for pearson's correlation

from sklearn.model_selection import train_test_split #for splitting the data in train and test
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler #for various scaling methods
from sklearn.linear_model import LogisticRegression #for LogisticRegression
from sklearn.naive_bayes import GaussianNB #for NaiveBayes
from sklearn.neighbors import KNeighborsClassifier #for KNN
from sklearn.svm import SVC #for Support vector classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score #for accuracy matrices
from sklearn.metrics import precision_score,classification_report,roc_auc_score,precision_score #for accuracy matrices


In [3]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import matplotlib.style as style
%matplotlib inline
style.use('ggplot')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

pd.pandas.set_option('display.max_columns',None)

In [42]:
df = pd.read_csv('classification_dataset.csv')
df.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
2,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
3,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0


In [5]:
df.columns

Index(['density_per_km', 'latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'visibility', 'winddirDegree', 'year', 'month', 'Target'],
      dtype='object')

In [6]:
df.shape

(155223, 19)

In [7]:
df.Target.value_counts()

0    89843
1    65380
Name: Target, dtype: int64

In [43]:
# Total no of duplicate rows
df.duplicated().sum()

79878

In [44]:
# Remove duplicate rows
df = df.drop_duplicates(keep='first', inplace=False)

# Method 1

In [31]:
feature_scale = [feature for feature in df.columns if feature in ['density_per_km','precipMM']]

scaler=MinMaxScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target','latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure',
       'visibility', 'winddirDegree', 'year', 'month']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

X = data[['latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure', 'visibility',
       'winddirDegree', 'year', 'month', 'density_per_km', 'precipMM']]

y = data['Target']

X = df.drop('Target',axis=1) #independent dimensions  
y = df['Target'] #selecting target column
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=1) #train test split in 50:50 ratio

# Logistic Regression
logic_r = LogisticRegression() 
logic_r.fit(X_train,y_train) 
log_y_pred = logic_r.predict(X_test)
LR_accuracy = accuracy_score(y_test,log_y_pred)
print('Accuracy of Logistic Regression :{:.2f}'.format(LR_accuracy))

# Naive bayes
NB = GaussianNB()   #Instantiate the Gaussian Naive bayes 
NB.fit(X_train,y_train) #Call the fit method of NB to train the model or to learn the parameters of model
NB_y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test,NB_y_pred)
print('Accuracy of Naive Bayes :{:.2f}'.format(NB_accuracy))

# Random forest
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
rand_y_pred=classifier.predict(X_test)
print('Accuracy of Random forest :{:.2f}'.format(accuracy_score(y_test,rand_y_pred)))

# Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
tree_y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree :{:.2f}'.format(accuracy_score(y_test,tree_y_pred)))

Accuracy of Logistic Regression :0.57
Accuracy of Naive Bayes :0.56
Accuracy of Random forest :0.44
Accuracy of Decision Tree :0.42


# Method 2 - MinMaxScaler

In [45]:
feature_scale = [feature for feature in df.columns if feature not in ['Target']]

scaler=MinMaxScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

X = data[['latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure', 'visibility',
       'winddirDegree', 'year', 'month', 'density_per_km', 'precipMM']]

y = data['Target']

X = df.drop('Target',axis=1) #independent dimensions  
y = df['Target'] #selecting target column
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=1) #train test split in 50:50 ratio

# Logistic Regression
logic_r = LogisticRegression() 
logic_r.fit(X_train,y_train) 
log_y_pred = logic_r.predict(X_test)
LR_accuracy = accuracy_score(y_test,log_y_pred)
print('Accuracy of Logistic Regression :{:.2f}'.format(LR_accuracy))

# Naive bayes
NB = GaussianNB()   #Instantiate the Gaussian Naive bayes 
NB.fit(X_train,y_train) #Call the fit method of NB to train the model or to learn the parameters of model
NB_y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test,NB_y_pred)
print('Accuracy of Naive Bayes :{:.2f}'.format(NB_accuracy))

# Random forest
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
rand_y_pred=classifier.predict(X_test)
print('Accuracy of Random forest :{:.2f}'.format(accuracy_score(y_test,rand_y_pred)))

# Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
tree_y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree :{:.2f}'.format(accuracy_score(y_test,tree_y_pred)))

Accuracy of Logistic Regression :0.57
Accuracy of Naive Bayes :0.56
Accuracy of Random forest :0.44
Accuracy of Decision Tree :0.42


# Method 3 - StandardScaler

In [40]:
feature_scale = [feature for feature in df.columns if feature not in ['Target']]

scaler=StandardScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

X = data[['latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure', 'visibility',
       'winddirDegree', 'year', 'month', 'density_per_km', 'precipMM']]

y = data['Target']

X = df.drop('Target',axis=1) #independent dimensions  
y = df['Target'] #selecting target column
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=1) #train test split in 50:50 ratio

# Logistic Regression
logic_r = LogisticRegression() 
logic_r.fit(X_train,y_train) 
log_y_pred = logic_r.predict(X_test)
LR_accuracy = accuracy_score(y_test,log_y_pred)
print('Accuracy of Logistic Regression :{:.2f}'.format(LR_accuracy))

# Naive bayes
NB = GaussianNB()   #Instantiate the Gaussian Naive bayes 
NB.fit(X_train,y_train) #Call the fit method of NB to train the model or to learn the parameters of model
NB_y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test,NB_y_pred)
print('Accuracy of Naive Bayes :{:.2f}'.format(NB_accuracy))

# Random forest
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
rand_y_pred=classifier.predict(X_test)
print('Accuracy of Random forest :{:.2f}'.format(accuracy_score(y_test,rand_y_pred)))

# Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
tree_y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree :{:.2f}'.format(accuracy_score(y_test,tree_y_pred)))

Accuracy of Logistic Regression :0.57
Accuracy of Naive Bayes :0.56
Accuracy of Random forest :0.44
Accuracy of Decision Tree :0.42


# Method 4 - RobustScaler

In [46]:
feature_scale = [feature for feature in df.columns if feature not in ['Target']]

scaler=RobustScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

X = data[['latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure', 'visibility',
       'winddirDegree', 'year', 'month', 'density_per_km', 'precipMM']]

y = data['Target']

X = df.drop('Target',axis=1) #independent dimensions  
y = df['Target'] #selecting target column
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=1) #train test split in 50:50 ratio

# Logistic Regression
logic_r = LogisticRegression() 
logic_r.fit(X_train,y_train) 
log_y_pred = logic_r.predict(X_test)
LR_accuracy = accuracy_score(y_test,log_y_pred)
print('Accuracy of Logistic Regression :{:.2f}'.format(LR_accuracy))

# Naive bayes
NB = GaussianNB()   #Instantiate the Gaussian Naive bayes 
NB.fit(X_train,y_train) #Call the fit method of NB to train the model or to learn the parameters of model
NB_y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test,NB_y_pred)
print('Accuracy of Naive Bayes :{:.2f}'.format(NB_accuracy))

# Random forest
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
rand_y_pred=classifier.predict(X_test)
print('Accuracy of Random forest :{:.2f}'.format(accuracy_score(y_test,rand_y_pred)))

# Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
tree_y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree :{:.2f}'.format(accuracy_score(y_test,tree_y_pred)))

Accuracy of Logistic Regression :0.57
Accuracy of Naive Bayes :0.56
Accuracy of Random forest :0.44
Accuracy of Decision Tree :0.42


In [51]:
feature_scale = [feature for feature in df.columns if feature not in ['Target']]

scaler=MinMaxScaler()
scaler.fit(df[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([df[['Target']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)],axis=1)

X = data[['latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure', 'visibility',
       'winddirDegree', 'year', 'month', 'density_per_km']]

y = data['Target']

X = df.drop('Target',axis=1) #independent dimensions  
y = df['Target'] #selecting target column
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=10) #train test split in 50:50 ratio

# Logistic Regression
logic_r = LogisticRegression() 
logic_r.fit(X_train,y_train) 
log_y_pred = logic_r.predict(X_test)
LR_accuracy = accuracy_score(y_test,log_y_pred)
print('Accuracy of Logistic Regression :{:.2f}'.format(LR_accuracy))

# Naive bayes
NB = GaussianNB()   #Instantiate the Gaussian Naive bayes 
NB.fit(X_train,y_train) #Call the fit method of NB to train the model or to learn the parameters of model
NB_y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test,NB_y_pred)
print('Accuracy of Naive Bayes :{:.2f}'.format(NB_accuracy))

# Random forest
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
rand_y_pred=classifier.predict(X_test)
print('Accuracy of Random forest :{:.2f}'.format(accuracy_score(y_test,rand_y_pred)))

# Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
tree_y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree :{:.2f}'.format(accuracy_score(y_test,tree_y_pred)))

Accuracy of Logistic Regression :0.56
Accuracy of Naive Bayes :0.56
Accuracy of Random forest :0.43
Accuracy of Decision Tree :0.42
