In [None]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the dataset from the CSV file into a DataFrame.
source_csv = "master_new.csv"
df = pd.read_csv(source_csv)

# Quick inspection of the data: first rows and per-column non-null counts.
# NOTE(review): the results are not printed or stored here - presumably these
# were meant to be viewed interactively in a notebook; confirm.
df.head()
df.count()

# Binarize the target: replace the "suicides/100k pop" rate with a 0/1
# "suicide" flag (0 when the rate is exactly 0, 1 otherwise). This flag is the
# dependent variable used by the classifiers further down.
#
# The flag is computed with a single vectorized comparison on the column
# itself, so it stays aligned with df's own index and works for any number of
# rows (no hard-coded row count, no row-by-row Python loop).
df["suicide"] = (df["suicides/100k pop"] != 0).astype("int64")
df.drop(["suicides/100k pop"], axis=1, inplace=True)

# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

# Convert the categorical columns into numerical codes. A single LabelEncoder
# instance is re-fitted for each column in turn, so after this block it holds
# only the mapping of the last column encoded ("generation").
le_Gender = LabelEncoder()
for source_col, encoded_col in (
    ("country", "country_b"),
    ("country-year", "country_year_b"),
    ("sex", "sex_b"),
    ("age", "age_b"),
    ("generation", "generation_b"),
):
    df[encoded_col] = le_Gender.fit_transform(df[source_col])

# The original text columns are no longer needed once encoded.
df.drop(
    columns=["sex", "age", "country", "country-year", "generation"],
    inplace=True,
)
# Give the GDP column a shorter name.
df.rename(columns={"gdp_for_year (in lacs)": "gdp_for_year"}, inplace=True)

# Min-max normalization: rescale each listed column to the [0, 1] range using
# that column's own minimum and maximum.
#
# The rescaling is done inside the per-column loop with one vectorized
# expression, so every listed column is normalized with its own min/max and
# each value is transformed exactly once (a value-by-value replace could
# re-map numbers that had already been normalized).
for column in ["year", "suicides_no", "HDI for year", "generation_b"]:
    min_b = float(df[column].min())
    max_b = float(df[column].max())
    value_range = max_b - min_b
    if value_range == 0:
        # Constant column: avoid dividing by zero, map everything to 0.
        df[column] = 0.0
    else:
        df[column] = (df[column] - min_b) / value_range

# These columns are not used as model inputs. Note that "year" and
# "generation_b" were rescaled above and are discarded here.
df.drop(["generation_b", "year", "country_year_b"], axis=1, inplace=True)

# Draw an annotated heat map of the pairwise correlations between all columns
# of df, to inspect how the independent variables relate to the dependent one.
fig_size = [12, 9]  # figure width, height
plt.rcParams["figure.figsize"] = fig_size
correlations = df.corr()
sns.heatmap(correlations, annot=True)

# Separate the target column from the predictors.
y = df["suicide"]  # dependent variable
x = df.drop(columns=["suicide"])  # independent variables: every other column

#Importing common modules for all models.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Logistic Regression model, evaluated once on the top 3 and once on the
# top 4 features.
for k_ in (3, 4):
    # Score every column of x against y with the chi-squared statistic.
    # The columns are then ranked by hand from the fitted scores; the
    # selector's own transform is not used.
    bestfeatures = SelectKBest(score_func=chi2, k=k_)
    fit = bestfeatures.fit(x, y)
    ranked_scores = pd.Series(fit.scores_).sort_values(ascending=False)
    features = ranked_scores.index.values  # column positions, best first
    X = x.iloc[:, features[:k_]]

    # Hold out 30% of the rows for testing (fixed seed for repeatability).
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=10
    )

    # Train the classifier and predict the held-out rows.
    logmodel = LogisticRegression().fit(x_train, y_train)
    y_pred = logmodel.predict(x_test)

    # Report accuracy, confusion matrix and the per-class report.
    print("For Top ",k_," Features, the analysis of Logistics Regression is as follows:")
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy Score is: ",acc)
    print("Confusion Matrix is:\n",confusion_matrix(y_test, y_pred))
    print("Classification Report is:\n",classification_report(y_test, y_pred))

# Gaussian Naive Bayes model, evaluated once on the top 3 and once on the
# top 4 features.
for k_ in (3, 4):
    # Rank the columns of x by their chi-squared score against y and keep the
    # k_ best ones (ranking is done by hand from the fitted scores).
    bestfeatures = SelectKBest(score_func=chi2, k=k_)
    fit = bestfeatures.fit(x, y)
    ranked_scores = pd.Series(fit.scores_).sort_values(ascending=False)
    features = ranked_scores.index.values  # column positions, best first
    X = x.iloc[:, features[:k_]]

    # Hold out 20% of the rows for testing (fixed seed for repeatability).
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2
    )

    # Train the classifier and predict the held-out rows.
    model = GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print("For Top ",k_," Features, the analysis of Naive Bayes is as follows:")

    # Accuracy; the value obtained with 3 features is kept in final_acc.
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy Score is: ",acc)
    if k_ == 3:
        final_acc = acc

    # Confusion matrix and per-class report.
    print("Confusion Matrix is:\n",confusion_matrix(y_test, y_pred))
    print("Classification Report is:\n",classification_report(y_test, y_pred))

