In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, KFold

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score, cohen_kappa_score, precision_score, recall_score, f1_score

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import tree
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier


In [None]:
#Read/Importing data
df= pd.read_csv(r'FILENAME.csv')

In [None]:
#To Display all the rows and columns of table
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None) 

In [None]:
#Keeping copy of file for future 
df1= df.copy()

In [None]:
## Data Cleaning and Preprocessing

In [None]:
#To display top 5 rows of the table
df.head()

In [None]:
#To display number of rows and columns
df.shape

In [None]:
#To display columns, data types, non-null values
df.info()

In [None]:
#To display data types of columns
df.dtypes

In [None]:
#To display list of columns
df.columns

In [None]:
#To display number of missing values and treat them
df.isnull().sum()

In [None]:
#To check missing values in percentage
(df.isnull().sum()*100/df.isnull().count()).sort_values(ascending = False)

In [None]:
#To visualise null values
sns.heatmap(df.isnull(), cbar=False)
plt.show()

In [None]:
# Drop every row that contains at least one missing value.
# Suitable when only a small share of rows is affected; columns that are mostly
# empty are better dropped first so they do not cause rows to be discarded here.
df.dropna(axis=0, inplace=True)

In [None]:
#Dropping columns with many missing values
df.drop(['Column1','Column2','Column3'], axis=1, inplace=True)

In [None]:
# Replace missing values in one column with a specific value.
# `value` is a placeholder for the fill value to use.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection acts on an intermediate object and, under pandas
# copy-on-write, leaves df unchanged.
df['column_name'] = df['column_name'].fillna(value)

In [None]:
# Replace missing values in one column with its mean
# (swap .mean() for .median() or .mode()[0] as appropriate).
# Uses `df` like the rest of the notebook, and assigns the result back instead of
# relying on a chained inplace fillna, which may not update df.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

In [None]:
#To display number of duplicate values
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
#To display columns that are numeric and categorical
cat_cols = df.dtypes[df.dtypes=='object'].index
num_cols = df.dtypes[df.dtypes!='object'].index
print(cat_cols)
print(num_cols)

In [None]:
#To display values and its count for categorical columns
for i in cat_cols:
    print(f'Feature {i}')
    print(f'Unique Values - {df[i].nunique()}')
    print(f'Value Counts\n{df[i].value_counts()}')
    print('-'*40)

In [None]:
#### Other basic functions

In [None]:
# Change data type to int or float.
# Pass a single type to astype: use float for columns with decimals or NaNs,
# or int for whole-number columns without missing values.
df['column_name'] = df['column_name'].astype(float)

In [None]:
# Text based data

# remove spaces from start or end of the values.
df['column_name'] = df['column_name'].str.strip()

# Convert text to lowercase
df['column_name'] = df['column_name'].str.lower()

# Remove special characters from text.
# regex=True is required: recent pandas treats the pattern as a literal string by
# default, which would leave the text untouched. The raw string keeps \w and \s
# from being read as (invalid) Python string escapes.
df['column_name'] = df['column_name'].str.replace(r'[^\w\s]', '', regex=True)

# Tokenize text into words
df['column_name'] = df['column_name'].str.split()

In [None]:
# Date time data

# Convert string column to datetime format
df['date_column'] = pd.to_datetime(df['date_column'])

# Extract year from a datetime column
df['year'] = df['date_column'].dt.year

# Extract month from a datetime column
df['month'] = df['date_column'].dt.month

# Extract day from a datetime column
df['day'] = df['date_column'].dt.day

# Calculate time differences
df['time_diff'] = df['end_time'] - df['start_time']

In [None]:
#To rename column name
df.rename(columns={'Old_Column_Name':'New_Column_Name'},inplace=True) 

# To replace specific values in a column.
# Assign the result back: an inplace replace on a column selection acts on an
# intermediate object and may not update df under pandas copy-on-write.
df['column_name'] = df['column_name'].replace({'old_value': 'new_value'})

In [None]:
#To remove a column from dataframe
df.drop('name',axis=1,inplace=True)

In [None]:
## EDA - Exploratory Data Analysis

In [None]:
#To display countplot of target variable
sns.countplot(x=df['target'])
plt.title('Countplot for Target')
plt.show()

In [None]:
#To display basic statistical values for numeric columns
df.describe()

In [None]:
# Univariate analysis
# Distribution of numeric independent variables- dist plot, histogram, boxplot
# Distribution of categorical independent variables-countplot, pie chart

In [None]:
#To display box plot for all numeric columns
for i in num_cols:
    sns.boxplot(y=df[i])
    plt.title(f'Boxplot for {i}')
    plt.show()

In [None]:
#To display histogram for all numeric columns
for i in num_cols:
    sns.histplot(x=df[i])
    # Title names the chart type actually drawn (a histogram, not a boxplot).
    plt.title(f'Histogram for {i}')
    plt.show()

In [None]:
##To display count plot for all categorical columns
for i in cat_cols:
    sns.countplot(x=df[i])
    plt.title(f'Countplot for {i}')
    plt.show()

In [None]:
#To display pie chart for all categorical columns
for i in cat_cols:
    count = df[i].value_counts()
    labels = count.index.tolist()
    sizes = count.values.tolist()
    plt.figure()
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.title(f'Pie chart for {i}')
    plt.show()

In [None]:
# Bi-Variate Analysis
# num-num : Scatterplot, pairplot
# num-cat : T-test, ANOVA, boxplot
# cat-cat : Chi-Square test

In [None]:
#To display pair plot between two numeric columns
sns.pairplot(df,vars=num_cols)  
plt.show()

In [None]:
#To display box plot between numeric and categorical columns
for i in cat_cols:
    sns.boxplot(x=df[i],y=df['Dependent_column'])
    plt.title(f'Boxplot of {i} VS Dependent_column')
    plt.show()

In [None]:
# Scatter plot of two numeric columns
plt.scatter(df['numeric_column1'], df['numeric_column2'])
plt.xlabel('Numeric Column 1')
plt.ylabel('Numeric Column 2')
plt.show()

In [None]:
# Box plot of a numeric column by a categorical column.
# x/y are passed by keyword: current seaborn versions no longer accept the data
# vectors as positional arguments.
sns.boxplot(x=df['categorical_column'], y=df['numeric_column'])
plt.xlabel('Categorical Column')
plt.ylabel('Numeric Column')
plt.show()

In [None]:
# Cross-tabulation between two categorical columns
cross_tab = pd.crosstab(df['categorical_column1'], df['categorical_column2'])
print(cross_tab)

In [None]:
#Multi-variate Analysis

In [None]:
#To display correlation between different columns
# numeric_only=True restricts the computation to numeric columns; without it,
# pandas >= 2.0 raises an error when the frame still contains text columns.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(25,25))
sns.heatmap(corr, annot=True, cmap='RdBu')
plt.show()

In [None]:
#To display columns that have high correlation
plt.figure(figsize=(25,25))
sns.heatmap(corr[abs(corr)>=0.7], annot=True, cmap='RdBu')
plt.show()

In [None]:
#  Inference

In [None]:
# Outlier Treatment

In [None]:
# Remove rows based on a condition
df = df[df['column_name'] != 'value_to_remove']

In [None]:
# Remove outliers based on domain knowledge or business rules
df = df[(df['column_name'] >= lower_threshold) & (df['column_name'] <= upper_threshold)]

In [None]:
df.describe(percentiles=[0.01,0.03,0.05,0.10,0.90,0.95,0.97,0.99]).T

In [None]:
def out_treat(x):
    """Cap extreme values of a numeric series (winsorisation).

    Values above the 97th percentile are pulled down to it first; the 1st
    percentile of that capped series is then used as the floor.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to treat (one column when used with DataFrame.apply).

    Returns
    -------
    pandas.Series
        A new series with both tails clipped; the input is not modified.
    """
    upper_cap = x.quantile(0.97)
    capped = x.clip(upper=upper_cap)
    # The floor is taken from the already-capped series, not from the raw input.
    lower_cap = capped.quantile(0.01)
    return capped.clip(lower=lower_cap)

In [None]:
num_cols = df.dtypes[df.dtypes!='object'].index
df[num_cols] = df[num_cols].apply(out_treat)

In [None]:
# Removing columns with correlation more than 90%

In [None]:
def high_corr_feat(corr,thresh):
    """List features that are highly correlated with an earlier feature.

    Only the lower triangle of the matrix is inspected, so for every strongly
    correlated pair the later column is reported and the earlier one is kept.
    The absolute correlation is compared, so strong negative relationships
    (e.g. -0.95) are treated as redundant just like strong positive ones.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix (as returned by DataFrame.corr()).
    thresh : float
        Absolute correlation at or above which a feature is flagged.

    Returns
    -------
    list
        Column labels to drop, each listed once, in the matrix's column order.
    """
    strength = corr.abs()
    res = []
    for i in range(len(corr.columns)):
        # Row i, columns 0..i-1: correlations with every earlier feature.
        # NaN entries compare as False and are therefore never flagged.
        if (strength.iloc[i, :i] >= thresh).any():
            res.append(corr.columns[i])
    return res
    
feat_to_drop = high_corr_feat(corr,0.9)
print(feat_to_drop)

In [None]:
df.drop(feat_to_drop,inplace=True,axis=1)

In [None]:
### Encoding categorical Columns

In [None]:
cat_cols

In [None]:
# Re-derive the categorical columns from the current frame: columns may have been
# dropped or converted since cat_cols was first computed, and get_dummies raises
# a KeyError for column names that are no longer present.
cat_cols = df.dtypes[df.dtypes=='object'].index
df = pd.get_dummies(df,columns=cat_cols,drop_first=True)
print(df.shape)
print(df.dtypes)

In [None]:
#### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [None]:
cat_cols = df.dtypes[df.dtypes=='object'].index
print(cat_cols)

In [None]:
for i in cat_cols:
    df[i] = lb.fit_transform(df[i])

In [None]:
# Get Dummies - OneHotEncoding

In [None]:
df.Column_name = df.Column_name.replace({'Column_values1':'1', 'Column_values2':'2'})

In [None]:
df = pd.get_dummies(df, columns = ['Column_name'],prefix = 'Column_c')

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
ohe = OneHotEncoder()
res_ohe = ohe.fit_transform(df[['Column_Name']]).toarray()
# Name the output columns after the categories the encoder actually found, so the
# frame is built correctly for any number of distinct values, and keep df's index
# so the encoded rows can be joined back onto df.
res_ohe_df = pd.DataFrame(res_ohe,columns=ohe.categories_[0],index=df.index)
res_ohe_df.head()

In [None]:
# Standardise data

In [None]:
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler rescales each feature to the [0, 1] range (min-max normalisation).
scaler = MinMaxScaler()
# fit_transform learns the per-feature min/max and applies the scaling in one
# step, so a separate scaler.fit(...) call beforehand is redundant.
# `data_selected` is a placeholder for the numeric feature frame to scale.
data_standardised = scaler.fit_transform(data_selected)

In [None]:
# Model building

In [None]:
# Creating a function to evaluate the model performance

In [None]:
def eval_model(model,x_train,x_test,y_train,y_test):
    """Fit a classifier and report its performance on the train and test splits.

    The model is fitted on the training data, then scored on both splits.
    Accuracy, precision, recall, Cohen's kappa and F1 are computed on the test
    predictions with the scorers' default arguments (binary target expected).
    The metrics, confusion matrix, predictions and classification report are
    printed, and the confusion matrix is drawn as a heatmap.

    Side effects
    ------------
    Stores the results in the module-level names train_score, test_score,
    precision, recall, acc, kappa and f1, which update_score_card() reads.
    The module-level `auc` is reset to NaN so that a model evaluated without a
    following plot_roc_auc() call is not recorded with the previous model's AUC.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier (fit / score / predict).
    x_train, x_test : array-like
        Feature matrices for the train and test splits.
    y_train, y_test : array-like
        Target labels for the train and test splits.
    """
    global train_score, test_score, precision, recall, acc, kappa, f1, auc

    # Invalidate any AUC left over from a previously evaluated model.
    auc = float('nan')

    model.fit(x_train,y_train)
    train_score = model.score(x_train,y_train)
    test_score = model.score(x_test,y_test)
    y_pred = model.predict(x_test)

    acc = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    kappa=cohen_kappa_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Convert to a percentage before rounding; rounding first and multiplying
    # afterwards prints floating-point noise such as 98.32000000000001.
    print('\nTraining Score is:', round(train_score*100,2), '%')
    print('Testing Score is:', round(test_score*100,2), '%\n')

    print('Accuracy_Score is', round(acc,4))
    print('precision Score is:', round(precision,4))
    print('Recall Score is:', round(recall,4))
    print('kappa Score is:', round(kappa,4))
    print('F1 Score is:', round(f1,4))

    cm = confusion_matrix(y_test,y_pred)
    print('\nConfusion Matrix is\n', cm)

    print('\nPredictions:\n',y_pred)
    print(classification_report(y_test,y_pred))

    plt.figure(figsize = (8,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="RdBu", cbar=False)
    plt.show()

In [None]:
def plot_roc_auc(model,y_test,ypred_proba):
    """Plot the ROC curve for a set of predicted probabilities and report the AUC.

    Column 1 of `ypred_proba` is used as the score of the positive class.
    The AUC is printed and also stored in the module-level name `auc`, which
    update_score_card() reads.

    Parameters
    ----------
    model : estimator
        Accepted for call-site symmetry; not used inside the function.
    y_test : array-like
        True labels of the test split.
    ypred_proba : 2-D array
        Class probabilities as returned by predict_proba.

    Returns
    -------
    tuple
        (fpr, tpr, thresh) as produced by roc_curve.
    """
    global auc
    positive_scores = ypred_proba[:,1]
    roc_points = roc_curve(y_test,positive_scores)
    false_pos_rate, true_pos_rate, cutoffs = roc_points
    auc = roc_auc_score(y_test,ypred_proba[:,1])
    print('AUC:',auc)

    plt.plot(false_pos_rate,true_pos_rate)
    # Diagonal reference line: the performance of a random classifier.
    plt.plot([0,1],[0,1],label='TPR=FPR',color='red')
    plt.xlabel('FPR',fontsize=15)
    plt.ylabel('TPR',fontsize=15)
    plt.title('ROC Curve',fontsize=16)
    plt.legend()
    plt.show()
    return false_pos_rate,true_pos_rate,cutoffs

In [None]:
# Running comparison table: one row per evaluated model.
score_card = pd.DataFrame(columns=['Model', 'Train Score', 'Test Score', 'AUC Score' , 'Precision Score', 
                                   'Recall Score', 'Accuracy Score', 
                                   'Kappa Score', 'f1-score'])

def update_score_card(model_name):
    """Append the most recently computed metrics to the score card.

    Reads the module-level metrics set by eval_model() (train_score,
    test_score, precision, recall, acc, kappa, f1) and by plot_roc_auc()
    (auc), and adds them as a new row of the module-level `score_card`.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column for this row.

    Returns
    -------
    pandas.DataFrame
        The updated score card (also rebound to the global `score_card`).
    """
    global score_card
    new_row = pd.DataFrame([{'Model': model_name, 'Train Score': train_score,
                             'Test Score': test_score,
                             'AUC Score' : auc,
                             'Precision Score': precision,
                             'Recall Score': recall, 'Accuracy Score': acc,
                             'Kappa Score': kappa, 'f1-score': f1}],
                           columns=score_card.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # The first row replaces the empty frame directly so that the columns take
    # their dtypes from real values rather than from the empty placeholder.
    if score_card.empty:
        score_card = new_row
    else:
        score_card = pd.concat([score_card, new_row], ignore_index=True)
    return score_card

In [None]:
#Select the Dependent and Independent Features
x = df.drop('Dependent_column',axis=1)
y = df['Dependent_column']
print(type(x),type(y))
print(x.shape,y.shape)

In [None]:
#Split the data into train and test set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=1000)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=10000)
eval_model(lr,x_train,x_test,y_train,y_test)

In [None]:
### ROC_AUC Curve

In [None]:
ypred_proba_lr = lr.predict_proba(x_test)
fpr_lr1,tpr_lr1,thresh_lr1 = plot_roc_auc(lr,y_test,ypred_proba_lr)

In [None]:
#### Creating tabulated data for evaluation matrix

In [None]:
update_score_card(model_name = 'Logistic Regression')

In [None]:
#### Logistic Regression (using SGD)

In [None]:
# loss='log_loss' fits logistic regression with SGD. The older spelling 'log' was
# removed in scikit-learn 1.3 (use 'log' only on scikit-learn older than 1.1).
lr_SGD = SGDClassifier(loss = 'log_loss', random_state = 10)
eval_model(lr_SGD,x_train,x_test,y_train,y_test)

In [None]:
# Use the SGD model's own probabilities; taking them from `lr` would plot and
# record the plain logistic regression's ROC/AUC under the SGD model's name.
ypred_proba_lr_SGD = lr_SGD.predict_proba(x_test)
fpr_lr2,tpr_lr2,thresh_lr2 = plot_roc_auc(lr_SGD,y_test,ypred_proba_lr_SGD)

In [None]:
update_score_card(model_name = 'Logistic Regression using SGD')

In [None]:
# Decision Tree Classifier

In [None]:
dt1 = DecisionTreeClassifier(criterion='gini')
eval_model(dt1,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_dt1 = dt1.predict_proba(x_test)

In [None]:
fpr_dt1,tpr_dt1,thresh_dt1 = plot_roc_auc(dt1,y_test,ypred_proba_dt1)

In [None]:
update_score_card(model_name = 'Decision Tree Classifier')

In [None]:
# Decision Tree with Pruning

In [None]:
# Hyperparameter Tuning Techniques

# GridSearchCV
# 1) Searches for all possible permutations and combination of hyperparameters and then generates the best hyperparameters.
# 2) High time complexity

# RandomizedSearchCV
# 1) Searches for some random combinations of hyperparameters and then generates the best parameters from amongst the randomly chosen combinations.
# 2) Low time complexity


In [None]:
tuned_paramaters = [ {'criterion':['gini','entropy'],'max_depth':[8,9,10,11,12,14],
'min_samples_split':[8,10,12,14,15,16,20]}]

dt_CV= DecisionTreeClassifier(random_state = 10)

grid = GridSearchCV(estimator = dt_CV, param_grid = tuned_paramaters, cv = 10)

dt_grid = grid.fit(x_train, y_train)

print('Best parameters for Decision Tree Classifier: ', dt_grid.best_params_, '\n')

In [None]:
# Rebuild the tree with the hyperparameters chosen by the grid search
# (criterion, max_depth, min_samples_split), using the same random_state as the
# estimator that was tuned.
dt2 = DecisionTreeClassifier(random_state=10, **dt_grid.best_params_)
eval_model(dt2,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_dt2 = dt2.predict_proba(x_test)

In [None]:
fpr_dt2,tpr_dt2,thresh_dt2 = plot_roc_auc(dt2,y_test,ypred_proba_dt2)

In [None]:
update_score_card(model_name = 'Decision Tree with Pruning')

In [None]:
# Ensemble Learning

In [None]:
# Bagging Classifier

In [None]:
bag = BaggingClassifier(tree.DecisionTreeClassifier(random_state=10))
eval_model(bag,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_bag = bag.predict_proba(x_test)

In [None]:
fpr_bag,tpr_bag,thresh_bag = plot_roc_auc(bag,y_test,ypred_proba_bag)

In [None]:
update_score_card(model_name = 'Bagging Ensemble')

In [None]:
# AdaBoost Classifier

In [None]:
adaboost = AdaBoostClassifier(tree.DecisionTreeClassifier(random_state=10))
eval_model(adaboost,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_adaboost = adaboost.predict_proba(x_test)

In [None]:
fpr_adaboost,tpr_adaboost,thresh_adaboost = plot_roc_auc(adaboost,y_test,ypred_proba_adaboost)

In [None]:
update_score_card(model_name = 'Adaboost Ensemble')

In [None]:
# XGBoost Classifier

In [None]:
xgbm = XGBClassifier(random_state=1,learning_rate=0.01)
eval_model(xgbm,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_xgbm = xgbm.predict_proba(x_test)

In [None]:
fpr_xgbm,tpr_xgbm,thresh_xgbm = plot_roc_auc(xgbm,y_test,ypred_proba_xgbm)

In [None]:
update_score_card(model_name = 'XG Boost Ensemble')

In [None]:
# Random Forest Classifier

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=10)
eval_model(rf,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_rf = rf.predict_proba(x_test)

In [None]:
fpr_rf,tpr_rf,thresh_rf = plot_roc_auc(rf,y_test,ypred_proba_rf)

In [None]:
update_score_card(model_name = 'Random Forest')

In [None]:
# Random Forest with GridSearchCV

In [None]:
tuned_paramaters = [{'n_estimators': [ 85, 100],
                     'min_samples_split': [15,20], 
                     'max_depth': [8, 10], 
                     'min_samples_leaf': [5,10], 
                     'max_leaf_nodes': [10, 15] }]

rf_CV= RandomForestClassifier(random_state = 10)

grid = GridSearchCV(estimator = rf_CV, param_grid = tuned_paramaters, cv = 10)

rf_grid = grid.fit(x_train, y_train)

print('Best parameters for random forest Classifier: ', rf_grid.best_params_, '\n')

In [None]:
# Rebuild the forest with the hyperparameters chosen by the grid search
# (max_depth, max_leaf_nodes, min_samples_leaf, min_samples_split, n_estimators),
# using the same random_state as the estimator that was tuned.
rf2 = RandomForestClassifier(random_state=10, **rf_grid.best_params_)
eval_model(rf2,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_rf2 = rf2.predict_proba(x_test)

In [None]:
fpr_rf2,tpr_rf2,thresh_rf2 = plot_roc_auc(rf2,y_test,ypred_proba_rf2)


In [None]:
update_score_card(model_name = 'Random Forest with Pruning')

In [None]:
# Naive Bayes Classifier

In [None]:
gnb = GaussianNB()
eval_model(gnb,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_gnb = gnb.predict_proba(x_test)

fpr_gnb,tpr_gnb,thresh_gnb = plot_roc_auc(gnb,y_test,ypred_proba_gnb)

In [None]:
update_score_card(model_name = 'Naive Bayes Classifier')

In [None]:
# KNN Classifier

In [None]:
knn1 = KNeighborsClassifier(n_neighbors=9)
eval_model(knn1,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_knn1 = knn1.predict_proba(x_test)

fpr_knn1,tpr_knn1,thresh_knn1 = plot_roc_auc(knn1,y_test,ypred_proba_knn1)

In [None]:
update_score_card(model_name = 'KNN')

In [None]:
# To find optimal number of neighbors for KNN

In [None]:
neighbors = list(range(3,51,2))
knn_acc = []
for i in neighbors:
    m = KNeighborsClassifier(n_neighbors=i)
    m.fit(x_train,y_train)
    ypred = m.predict(x_test)
    knn_acc.append(accuracy_score(y_test,ypred))

In [None]:
plt.plot(neighbors,knn_acc,color='maroon',label='Test Acc',marker='o')
plt.title('Test Accuracy vs No of Neighbors - KNN')
plt.xlabel('No of Neighbors')
plt.ylabel('Test Accuracy')
plt.xticks(neighbors)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Pick the neighbor count with the highest accuracy in the sweep above
# (the smallest such k wins on ties).
# NOTE(review): k is chosen using the test split, so knn2's test metrics are
# optimistic; consider selecting k with cross-validation on the training data.
best_k = neighbors[knn_acc.index(max(knn_acc))]
print('Best number of neighbors:', best_k)
knn2 = KNeighborsClassifier(n_neighbors=best_k)
eval_model(knn2,x_train,x_test,y_train,y_test)

In [None]:
ypred_proba_knn2 = knn2.predict_proba(x_test)

fpr_knn2,tpr_knn2,thresh_knn2 = plot_roc_auc(knn2,y_test,ypred_proba_knn2)

In [None]:
update_score_card(model_name = 'KNN with tuned parameters')

In [None]:
# Support Vector Classifier

In [None]:
# SVM using linear kernel

In [None]:
sv1= SVC(kernel='linear')
eval_model(sv1,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector using Linear kernel')

In [None]:
# SVM using Gaussian kernel

In [None]:
sv_gaussian= SVC(kernel='rbf')
eval_model(sv_gaussian,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector using Gaussian kernel')

In [None]:
# SVM using Sigmoid kernel

In [None]:
sv_sigmoid= SVC(kernel='sigmoid')
eval_model(sv_sigmoid,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector using Sigmoid kernel')

In [None]:
# SVM using Polynomial kernel

In [None]:
sv_poly= SVC(kernel='poly')
eval_model(sv_poly,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector using Polynomial kernel')

In [None]:
# SVM using GridSearchCV

In [None]:
tuned_paramaters = [{ 'degree': [2, 4], 'gamma' : ['auto','scale' ], 'C': [0.5, 1] }]

# 'degree' is only used by the polynomial kernel, so the search has to run on a
# poly-kernel SVC; with the default RBF kernel the degree values are ignored.
svm_CV= SVC(kernel='poly', random_state = 10)

grid = GridSearchCV(estimator=svm_CV , param_grid=tuned_paramaters, scoring='accuracy', cv= 5)

svm_grid = grid.fit(x_train, y_train)

print('Best parameters for Support Vector Classifier: ', svm_grid.best_params_, '\n')

In [None]:
# Build the polynomial-kernel SVC from the hyperparameters chosen by the grid
# search (C, degree, gamma).
sv_poly_cv= SVC(kernel='poly', **svm_grid.best_params_)
eval_model(sv_poly_cv,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector Polynomial with Grid search CV')

In [None]:
# Model Comparison

In [None]:
score_card

In [None]:
score_card['Test Score'].idxmax()

In [None]:
print (score_card['Model'][score_card['Test Score'].idxmax()], "is the best performing model")

In [None]:
# #END