In [None]:
#for complex mathematical operations
import numpy as np 

#for dataframe manipulation
import pandas as pd

# for data visulisation
import seaborn as sns
import matplotlib.pyplot as plt

#for implementing complex machine learning algorithm use sklearn

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the class-wise dataset from the CSV file into a DataFrame.
data = pd.read_csv(
    "newkpcbclasswise.csv",
    encoding="unicode_escape",
)

In [None]:
# Report the (rows, columns) dimensions of the loaded DataFrame.
print("Shape of Dataset:", data.shape)

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Count the missing values in every column, since missing entries need to be
# dealt with before a model is trained.
data.isna().sum()

In [None]:
# Number of rows per CLASS label in the raw data (shows the class imbalance).
data["CLASS"].value_counts()

In [None]:
from sklearn.utils import resample


def _upsample_to(frame, target_rows, seed=1234):
    """Return `target_rows` rows drawn from `frame` with replacement.

    The draw is seeded so that re-running the cell gives the same sample.
    """
    return resample(frame, replace=True, n_samples=target_rows, random_state=seed)


# Separate the class treated as the majority ('B') from the other classes.
data_majority = data[data.CLASS == 'B']
data_minority_1 = data[data.CLASS == 'A']
data_minority_2 = data[data.CLASS == 'C']
data_minority_3 = data[data.CLASS == 'D']
data_minority_4 = data[data.CLASS == 'E']
data_minority_5 = data[data.CLASS == 'Below E']

# Upsample every other class to the row count of the 'B' subset.
majority_rows = len(data_majority)
data_minority_upsampled_1 = _upsample_to(data_minority_1, majority_rows)
data_minority_upsampled_2 = _upsample_to(data_minority_2, majority_rows)
data_minority_upsampled_3 = _upsample_to(data_minority_3, majority_rows)
data_minority_upsampled_4 = _upsample_to(data_minority_4, majority_rows)
data_minority_upsampled_5 = _upsample_to(data_minority_5, majority_rows)

# Combine the 'B' rows with the upsampled rows of the other classes.
# NOTE(review): the combined frame contains repeated copies of the same source
# rows, so any later train/test split has to keep all copies of one row on the
# same side, otherwise test rows will also be present in the training data.
data_upsampled = pd.concat([
    data_majority,
    data_minority_upsampled_1,
    data_minority_upsampled_2,
    data_minority_upsampled_3,
    data_minority_upsampled_4,
    data_minority_upsampled_5,
])

In [None]:
# Number of rows per CLASS label after upsampling.
data_upsampled["CLASS"].value_counts()

In [None]:
# The label column must not be part of the model inputs, otherwise the model
# would be given the answer. x holds the remaining columns (with 'Source' and
# 'Stations' dropped as well); y holds the CLASS label.

x = data_upsampled.drop(columns=['CLASS', 'Source', 'Stations'])
y = data_upsampled['CLASS']

print("Shape of x : ", x.shape)
print("Shape of y : ", y.shape)

In [None]:
# Creating training and testing dataset
#
# x / y are taken from the upsampled frame, in which the same source row can
# appear many times. A plain row-wise split would place copies of one source
# row on both sides, so the test set would contain rows the model was trained
# on and the reported scores would be inflated. Rows keep the index label of
# the source row they were drawn from, so that label is used as a group id and
# every group is kept entirely on one side of the split.
# NOTE(review): this relies on the index of `data` being unique per source row
# (the default RangeIndex produced by read_csv) -- confirm if the loading step
# changes.

# train_test_split stays imported for any other cell that relies on the name.
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

# 5 folds -> the held-out fold is roughly 20% of the rows (previously
# test_size=0.2); folds are built to keep the class proportions similar
# (previously stratify=y). Requires scikit-learn >= 1.0.
# NOTE(review): a class with fewer than 5 distinct source rows cannot be
# present in every fold, so it may be missing from the test fold.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=13)
train_pos, test_pos = next(splitter.split(x, y, groups=x.index.to_numpy()))

x_train, x_test = x.iloc[train_pos], x.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# No source row may end up on both sides of the split.
assert set(x_train.index).isdisjoint(x_test.index), "train and test share source rows"

print("The shape of x train : ", x_train.shape)
print("The shape of x test : ", x_test.shape)
print("The shape of y train : ", y_train.shape)
print("The shape of y test : ", y_test.shape)

In [None]:
# Standardizing: the scaler is fitted on the training features only, and the
# fitted scaler is then applied to both the training and the test features.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
#Importing confusion_matrix, accuracy_score, classification_report from sklearn.metrics library to obtain confusion matrix, accuracy score and classification report
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
#Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB

# Specify the model
model = GaussianNB()

# Fit the model to the training data and predict the test set
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# The model is evaluated by comparing y_test with y_pred: confusion matrix,
# accuracy and classification report.

# Sorted set of labels that occur in y_test or y_pred. Passing it explicitly
# fixes the row/column order of the matrix so the tick labels line up with it.
class_labels = np.union1d(y_test, y_pred)

#To print confusion matrix
plt.rcParams["figure.figsize"]=(5,5)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# fmt="d" prints whole counts; the default annotation format would show counts
# of 100 or more in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Greens",
    center=0.6,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix for Naive Bayes", fontsize=15)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

#To check accuracy
print("Accuracy of the Naive Bayes Classifier model in % is:", round(accuracy_score(y_test, y_pred)*100,2))

#To print the Classification Report :
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Seeded so that re-running the notebook reproduces the same tree and scores,
# in line with the seeded resampling and splitting steps.
model = DecisionTreeClassifier(random_state=13)
model.fit(x_train, y_train) #Training
y_pred = model.predict(x_test)

# The model is evaluated by comparing y_test with y_pred: confusion matrix,
# accuracy and classification report.

# Sorted set of labels that occur in y_test or y_pred. Passing it explicitly
# fixes the row/column order of the matrix so the tick labels line up with it.
class_labels = np.union1d(y_test, y_pred)

#To print confusion matrix
plt.rcParams["figure.figsize"]=(5,5)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# fmt="d" prints whole counts; the default annotation format would show counts
# of 100 or more in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    cbar_kws={'label': 'Scale'},
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix for Decision Tree", fontsize=15)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

#To check accuracy
print("Accuracy of the Decision Tree Classifier model in % is:", round(accuracy_score(y_test, y_pred)*100,2))

#To print the Classification Report :
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:


#Support Vector Machine Classifier
from sklearn.svm import SVC

model = SVC()
model.fit(x_train, y_train) #Training
y_pred = model.predict(x_test)
# The accuracy is reported once, further down, from y_pred; the extra
# model.score(...) call whose result was discarded has been dropped because it
# only repeated the prediction pass.

# The model is evaluated by comparing y_test with y_pred: confusion matrix,
# accuracy and classification report.

# Sorted set of labels that occur in y_test or y_pred. Passing it explicitly
# fixes the row/column order of the matrix so the tick labels line up with it.
class_labels = np.union1d(y_test, y_pred)

#To print confusion matrix
plt.rcParams["figure.figsize"]=(5,5)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# fmt="d" prints whole counts; the default annotation format would show counts
# of 100 or more in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="inferno",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix for Support Vector Machine Classifier", fontsize=15)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

#To check accuracy
print("Accuracy of the Support Vector Machine model  in % is:", round(accuracy_score(y_test, y_pred)*100,2))

#To print the Classification Report :
cr = classification_report(y_test, y_pred)
print(cr)


In [None]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Seeded so that re-running the notebook reproduces the same forest and scores,
# in line with the seeded resampling and splitting steps.
model = RandomForestClassifier(random_state=13)
model.fit(x_train, y_train) #Training
y_pred = model.predict(x_test)
# The accuracy is reported once, further down, from y_pred; the extra
# model.score(...) call whose result was discarded has been dropped because it
# only repeated the prediction pass.

# The model is evaluated by comparing y_test with y_pred: confusion matrix,
# accuracy and classification report.

# Sorted set of labels that occur in y_test or y_pred. Passing it explicitly
# fixes the row/column order of the matrix so the tick labels line up with it.
class_labels = np.union1d(y_test, y_pred)

#To print confusion matrix
plt.rcParams["figure.figsize"]=(5,5)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# fmt="d" prints whole counts; the default annotation format would show counts
# of 100 or more in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="BuPu",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix for Random Forest Classifier", fontsize=15)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

#To check accuracy
print("Accuracy of the Random Forest Classifier model in % is:", round(accuracy_score(y_test, y_pred)*100,2))

#To print the Classification Report :
cr = classification_report(y_test, y_pred)
print(cr)