**Importing Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

# **NYPD SHOOTING INCIDENT DATA**

---






**Objective**

We have leveraged the dataset to derive a solution to the following business questions:

*   Trend in number of incidents in each precinct
* Which Boroughs are more unsafe than others?

*   Which age group and race account for the highest number of perpetrators?

*   Which jurisdiction has observed the highest number of incidents?
*   How many incidents resulted in murder?
*   Which age group of victims are mostly targeted by perpetrators?
*   Trend of the incidents reported from 2006 to 2019 - which year has observed the highest number of shooting incidents in New York?
*   Predictive model to predict the age-group, sex, race of the victim and location of the incident
*   Which is the best Predictive analytical method for the dataset?


---







**Importing Data**

In [None]:
# Load the NYPD historic shooting-incident CSV into the working DataFrame `df`.
DATA_PATH = '../input/nypd-shooting-history/NYPD_Shooting_Incident_Data__Historic_.csv'
df = pd.read_csv(DATA_PATH)

# **Section 1: DATA CLEANING**

In [None]:
# List the column labels of the dataframe.
df.columns

In [None]:
# Print a per-column summary (dtype and non-null count) with the info command.
df.info()

In [None]:
# Preview the first rows of the dataframe with the head command.
df.head()

**Feature Engineering: Working with Null values and outliers**

In [None]:
# Count the null values in each column of the data set.
df.isna().sum()

In [None]:
# Express the per-column null counts as a percentage of the total number of rows.
print('Percentage of Null values with respect to columns in the dataset')
null_counts = df.isnull().sum()
100 * null_counts / len(df)

In [None]:
# Fill the null values of 'PERP_RACE' by sampling from the categories that are
# already present, weighted by their observed relative frequency, so the filled
# column keeps the distribution of the known values.
# NOTE(review): the filled entries are simulated, not observed; every later count
# on this column mixes recorded and imputed values.
col = 'PERP_RACE'
s = df[col].value_counts(normalize=True)
print('Values present in the columns with percentage')
print(s)
missing = df[col].isnull()
n_missing = int(missing.sum())
if n_missing:
    # A locally seeded generator makes the imputation identical on every run.
    rng = np.random.default_rng(42)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())
print('Null values filled with appropriate data')
df[col].value_counts()

In [None]:
# Fill the null values of 'PERP_AGE_GROUP' by sampling from the categories that
# are already present, weighted by their observed relative frequency.
# NOTE(review): the filled entries are simulated, not observed.
col = 'PERP_AGE_GROUP'
# Malformed codes that the following cells drop as outliers. They are kept out of
# the sampling pool so an imputed row can never receive a value that is removed
# afterwards (which would silently delete an otherwise valid incident).
invalid_age_groups = ['224', '940', '1020']
s = df.loc[~df[col].isin(invalid_age_groups), col].value_counts(normalize=True)
print('Values present in the columns with percentage')
print(s)
missing = df[col].isnull()
n_missing = int(missing.sum())
if n_missing:
    # A locally seeded generator makes the imputation identical on every run.
    rng = np.random.default_rng(42)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())
print('Null values filled with appropriate data')
df[col].value_counts()

In [None]:
# Remove the rows whose 'PERP_AGE_GROUP' holds the outlier value '224', then show
# the remaining category counts.
is_outlier = df['PERP_AGE_GROUP'] == '224'
df = df.drop(index=df[is_outlier].index)
df['PERP_AGE_GROUP'].value_counts()

In [None]:
# Remove the rows whose 'PERP_AGE_GROUP' holds the outlier value '940', then show
# the remaining category counts.
is_outlier = df['PERP_AGE_GROUP'] == '940'
df = df.drop(index=df[is_outlier].index)
df['PERP_AGE_GROUP'].value_counts()

In [None]:
# Remove the rows whose 'PERP_AGE_GROUP' holds the outlier value '1020'.
is_outlier = df['PERP_AGE_GROUP'] == '1020'
df = df.drop(index=df[is_outlier].index)

In [None]:
# Show each distinct value of the 'PERP_SEX' column together with its count.
df['PERP_SEX'].value_counts()

In [None]:
# Fill the null values of 'PERP_SEX' by sampling from the categories that are
# already present, weighted by their observed relative frequency.
# NOTE(review): the filled entries are simulated, not observed.
col = 'PERP_SEX'
s = df[col].value_counts(normalize=True)
print('Values present in the columns with percentage')
print(s)
missing = df[col].isnull()
n_missing = int(missing.sum())
if n_missing:
    # A locally seeded generator makes the imputation identical on every run.
    rng = np.random.default_rng(42)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())
print('Null values filled with appropriate data')
df[col].value_counts()

In [None]:
# Show each distinct value of the 'LOCATION_DESC' column together with its count.
df['LOCATION_DESC'].value_counts()

In [None]:
# Fill the null values of 'LOCATION_DESC' by sampling from the categories that
# are already present, weighted by their observed relative frequency.
# NOTE(review): the filled entries are simulated, not observed.
col = 'LOCATION_DESC'
s = df[col].value_counts(normalize=True)
print('Values present in the columns with percentage')
print(s)
missing = df[col].isnull()
n_missing = int(missing.sum())
if n_missing:
    # A locally seeded generator makes the imputation identical on every run.
    rng = np.random.default_rng(42)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())
print('Null values filled with appropriate data')
df[col].value_counts()

In [None]:
# Show each distinct value of the 'JURISDICTION_CODE' column together with its count.
df['JURISDICTION_CODE'].value_counts()

In [None]:
# Fill the null values of 'JURISDICTION_CODE' by sampling from the codes that are
# already present, weighted by their observed relative frequency.
# NOTE(review): the filled entries are simulated, not observed.
col = 'JURISDICTION_CODE'
s = df[col].value_counts(normalize=True)
print('Values present in the columns with percentage')
print(s)
missing = df[col].isnull()
n_missing = int(missing.sum())
if n_missing:
    # A locally seeded generator makes the imputation identical on every run.
    rng = np.random.default_rng(42)
    df.loc[missing, col] = rng.choice(s.index.to_numpy(), size=n_missing, p=s.to_numpy())
print('Null values filled with appropriate data')
df[col].value_counts()

In [None]:
# After handling the null values and outliers above, re-count the nulls per column
# to check whether any are still present in the data set.
df.isna().sum()

# **Section 2: DESCRIPTIVE ANALYSIS**

Trend in the number of incidents in each precinct
*   Total number of precincts – 123
*   Highest cases observed in precincts (40-80)

In [None]:
def annot_plot(ax,w,h):
    """Hide the top/right spines of ``ax`` and label every bar with its height.

    Parameters
    ----------
    ax : axes-like object
        Must expose ``spines``, ``patches`` and ``annotate``.
    w : float
        Horizontal offset added to each bar's left edge to place the label.
    h : float
        Vertical offset added to each bar's height to place the label.

    Bars whose height is not finite are skipped (some seaborn versions emit
    NaN-height placeholder bars for empty category/hue combinations); labelling
    them would draw the text "nan" at an invalid position.
    """
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    for bar in ax.patches:
        height = bar.get_height()
        if not np.isfinite(height):
            continue
        ax.annotate('{0:.1f}'.format(height), (bar.get_x()+w, height+h))

# Bar chart of how many incidents were recorded in each precinct.
precinct_counts = df['PRECINCT'].value_counts()
fig, ax = plt.subplots()
# x = precinct labels, y = number of rows for that precinct
ax.bar(precinct_counts.index, precinct_counts.values)
ax.set_title('NYPD Shooting Incident in each precinct')
ax.set_xlabel('PRECINCT')
ax.set_ylabel('Frequency')

Visualizing the number of incidents that took place in each borough


*   As per the visualization, the highest number of shooting incidents has been reported in Brooklyn, followed by Bronx, Queens, Manhattan and Staten Island respectively




In [None]:
# Count the incidents per borough, largest first.
group_boro = df.groupby('BORO')['INCIDENT_KEY'].count().sort_values(ascending=False)
# Draw the counts as a bar plot on an explicit axis.
fig, ax = plt.subplots(figsize=(15, 8))
ax = group_boro.plot(kind='bar', fontsize=12, color='grey', ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 10000)
# Axis labels and title.
ax.set_xlabel('Places', fontsize=12)
ax.set_ylabel('INCIDENT KEY', fontsize=12)
ax.set_title('Incident count by Places', fontsize=12)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.2, 1)
plt.show()

Perpetrators by race and age group


*   The visualization below highlights that maximum number of suspects belong to the race: black and age group: 18-24 followed by the race: White Hispanics and age group: 25-44


In [None]:
# annot_plot is already defined in the precinct-chart cell above; it is not
# redefined here so the notebook keeps a single definition of the helper.
fig, ax = plt.subplots(figsize=(18, 7))
# Count plot of 'PERP_RACE' split by 'PERP_AGE_GROUP'. `x` is passed by keyword:
# newer seaborn releases no longer accept it positionally.
sns.countplot(x='PERP_RACE', hue='PERP_AGE_GROUP', data=df, palette='Set2', ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 6000)
plt.xticks(rotation=45, ha='right')
# Place the legend inside the graph.
ax.legend(bbox_to_anchor=(0.9, 0.8), loc=2, borderaxespad=0.)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.02, 1)
plt.show()


Visualizing perpetrators by race for each age group


*   The visualization below highlights that maximum number of suspects belong to the race: black and age group: 18-24 followed by the race: White Hispanics and age group: 25-44


In [None]:
# Count the perpetrators of each race within every perpetrator age group and draw
# one bar chart per age group.
age_group = df.groupby('PERP_AGE_GROUP')['PERP_RACE'].value_counts()
fig = plt.figure(figsize=(12, 30))
# Subplot positions are 1-based on a 6x3 grid.
for position, (age_label, counts) in enumerate(age_group.groupby('PERP_AGE_GROUP'), start=1):
    ax = fig.add_subplot(6, 3, position)
    # counts[age_label] strips the age-group level, leaving a Series indexed by race.
    counts[age_label].plot.bar(ax=ax, width=0.8, color='orange')
    ax.set_title(age_label)
    # The race names on the x ticks are self-explanatory; the y ticks are replaced
    # by the per-bar labels written below.
    ax.set_xlabel('')
    ax.set_yticks([])
    ax.set_ylabel('Count of PERP RACE')
    # Write each bar's count just below its top edge.
    for bar in ax.patches:
        ax.text(round(bar.get_x()+0.2, 1), round(bar.get_height()-1.5, 1), s=round(bar.get_height(), 1), color="black", fontweight='bold')
plt.tight_layout()
plt.show()

Visualizing the NYPD jurisdiction codes reported for every incident
*   Most incidents were reported in the Jurisdiction code 0 (Patrol) followed by Jurisdiction 2 (Housing) and Jurisdiction 1 (Transit) respectively

In [None]:
# Count plot of the distinct values in 'JURISDICTION_CODE'.
fig, ax = plt.subplots(figsize=(16, 7))
# `x` is passed by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='JURISDICTION_CODE', data=df, ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 20000)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.3, 1)
plt.show()

Visualizing the jurisdiction codes reported per location for every incident


*   Most incidents were reported in the Jurisdiction code 0 (Patrol)
*   Highest number of incidents were reported in Brooklyn with the Jurisdiction code 0



In [None]:
# Count plot of 'JURISDICTION_CODE' split by borough ('BORO').
fig, ax = plt.subplots(figsize=(16, 7))
# `x` is passed by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='JURISDICTION_CODE', hue='BORO', data=df, palette='Set2', ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 8000)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.02, 1)
plt.show()

Visualizing the shooting incidents that resulted in murders


*   Most of the reported incidents (17496) were not classified as murder, while 4127 incidents resulted in murder


In [None]:
# Count plot of the distinct values in 'STATISTICAL_MURDER_FLAG'.
fig, ax = plt.subplots(figsize=(16, 7))
# `x` is passed by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='STATISTICAL_MURDER_FLAG', data=df, ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 20000)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.2, 1)
plt.show()

Analyzing the shooting incidents that resulted in murders - categorized by the suspect's age group
*   Maximum number (17496) of incidents reported were not classified as murder
*   Maximum number of suspects belong to the age group 18-24 in cases of the incident resulting in murder or otherwise


In [None]:
# Count plot of 'STATISTICAL_MURDER_FLAG' split by 'PERP_AGE_GROUP'.
fig, ax = plt.subplots(figsize=(16, 7))
# `x` is passed by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='STATISTICAL_MURDER_FLAG', hue='PERP_AGE_GROUP', data=df, palette='Set1', ax=ax)
# Only bottom/top are passed: a third positional value is taken as `emit`, which
# newer Matplotlib releases reject.
ax.set_ylim(0, 8000)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.02, 1)
plt.show()

Visualizing the victims per their gender
*   Black males are the most common victims followed by white hispanic males
* American Indian/Alaskan Natives are the least common victims


In [None]:
# Palette and style for the seaborn plots from here on.
sns.set_palette("GnBu_d")
sns.set_style('whitegrid')
# Count plot of 'VIC_RACE' split by 'VIC_SEX'.
fig, ax = plt.subplots(figsize=(16, 7))
# `x` is passed by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='VIC_RACE', hue='VIC_SEX', data=df, palette='Set1', ax=ax)
# Tick font size and y range are set on the axis after plotting so they apply to
# the ticks that are actually drawn. Only bottom/top are passed to set_ylim: a
# third positional value is taken as `emit`, which newer Matplotlib rejects.
ax.tick_params(axis='y', labelsize=10)
ax.set_ylim(0, 16000)
# Label each bar with its count and show the figure.
annot_plot(ax, 0.02, 1)
plt.show()


Visualizing victims categorized by race for every age group


*   Maximum number of victims belong to the age group: 18-24 followed by 25-44 and less than 18 respectively


In [None]:
# Count the victims of each race within every victim age group and draw one bar
# chart per age group.
vic_age = df.groupby('VIC_AGE_GROUP')['VIC_RACE'].value_counts()
fig = plt.figure(figsize=(12, 30))
# Subplot positions are 1-based on a 6x3 grid.
for position, (age_label, counts) in enumerate(vic_age.groupby('VIC_AGE_GROUP'), start=1):
    ax = fig.add_subplot(6, 3, position)
    # counts[age_label] strips the age-group level, leaving a Series indexed by race.
    counts[age_label].plot.bar(ax=ax, width=0.8, color='indigo')
    ax.set_title(age_label)
    # The race names on the x ticks are self-explanatory; the y ticks are replaced
    # by the per-bar labels written below.
    ax.set_xlabel('')
    ax.set_yticks([])
    ax.set_ylabel('Count of Victims(Race Wise)')
    # Write each bar's count just below its top edge.
    for bar in ax.patches:
        ax.text(round(bar.get_x()+0.2, 1), round(bar.get_height()-1.5, 1), s=round(bar.get_height(), 1), color="black", fontweight='bold')
plt.tight_layout()
plt.show()

Visualizing victims categorized by gender for every age group

*   The bar plots below highlight that Black males are the most common victims followed by White Hispanic males
*   Maximum number of victims belong to the age group: 18-24 followed by 25-44 and less than 18 respectively



In [None]:
# Count the victims of each sex within every victim age group and draw one bar
# chart per age group.
a = df.groupby('VIC_AGE_GROUP')['VIC_SEX'].value_counts()
fig = plt.figure(figsize=(13, 35))
# Subplot positions are 1-based on a 6x3 grid.
for position, (age_label, counts) in enumerate(a.groupby('VIC_AGE_GROUP'), start=1):
    ax = fig.add_subplot(6, 3, position)
    # counts[age_label] strips the age-group level, leaving a Series indexed by sex.
    counts[age_label].plot.bar(ax=ax, width=0.8, color='Red')
    ax.set_title(age_label)
    # The x tick labels are self-explanatory; the y ticks are replaced by the
    # per-bar labels written below.
    ax.set_xlabel('')
    ax.set_yticks([])
    ax.set_ylabel('Count of Victims(Sex Wise)')
    # Write each bar's count just below its top edge.
    for bar in ax.patches:
        ax.text(round(bar.get_x()+0.2, 1), round(bar.get_height()-1.5, 1), s=round(bar.get_height(), 1), color="black", fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Parse 'OCCUR_DATE' once and derive the INCIDENT_YEAR and INCIDENT_MONTH columns
# from the parsed dates.
occur_dates = pd.DatetimeIndex(df["OCCUR_DATE"])
df["INCIDENT_YEAR"] = occur_dates.year
df["INCIDENT_MONTH"] = occur_dates.month

Count of incidents reported from 2006 to 2019
*   The visualization below highlights that there has been a gradual decrease in the number of incidents reported since 2006




In [None]:
# Plotly histogram of incidents per year, coloured by month, with a rug plot in
# the margin and every column available in the hover box.
histogram_options = dict(
    x="INCIDENT_YEAR",
    color="INCIDENT_MONTH",
    marginal="rug",
    hover_data=df.columns,
)
fig = px.histogram(df, **histogram_options)
fig.show()

Trend of incidents reported from 2006 to 2019

*   The visualization below highlights that there has been a gradual decrease in the number of incidents reported since 2006

*   There have been instances of a slight increase in the number of cases, from 2009 to 2011 and from 2013 to 2014, but the trend soon resumed its gradual decrease.



In [None]:
# Line graph of the number of incidents per year, to show whether the trend is
# rising or falling.
# Count the rows per year (INCIDENT_KEY is simply the column being counted).
yearly_counts = df.groupby('INCIDENT_YEAR')[['INCIDENT_KEY']].count().reset_index()
# Long format so plotly draws one line per 'variable' (here only INCIDENT_KEY).
yearly_long = pd.melt(yearly_counts, id_vars=['INCIDENT_YEAR'], value_vars=['INCIDENT_KEY'])
# plotly.express is already imported as `px` in the imports cell at the top.
fig = px.line(yearly_long, x="INCIDENT_YEAR", y="value", color='variable',
              title='ALL Incidents Reported')
fig.update_layout(yaxis_range=[100,2500])
fig.show()

# **PREDICTIVE ANALYSIS**

In [None]:
# Label Encoder turns the categorical columns into integer codes for the models.
from sklearn.preprocessing import LabelEncoder

# Columns to encode, in the order they are processed.
categorical_columns = ['BORO', 'VIC_RACE', 'VIC_SEX', 'PERP_SEX', 'PERP_RACE',
                       'STATISTICAL_MURDER_FLAG', 'VIC_AGE_GROUP', 'PERP_AGE_GROUP']
# One fitted encoder is kept per column so that codes (and model predictions) can
# be mapped back to the original labels with
# label_encoders[column].inverse_transform(...). Re-fitting a single shared
# encoder would keep only the mapping of the last column.
label_encoders = {}
for column in categorical_columns:
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    label_encoders[column] = labelencoder
df.head()


**Predicting Victim Race**


*   Analyzing the race most commonly targeted in shooting incidents (the dataset records shootings and does not indicate whether an incident was a hate crime)
*   The model has an accuracy of 73.8%



In [None]:
# train_test_split is used by the next cell to split the data into train and test sets.
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target and the raw description, date,
# time and coordinate fields.
excluded_columns = ['VIC_RACE','LOCATION_DESC','OCCUR_DATE','OCCUR_TIME','X_COORD_CD','Y_COORD_CD','Latitude','Longitude','Lon_Lat']
X = df.drop(columns=excluded_columns)
# Target variable: the victim race column.
y = df['VIC_RACE']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
# Random Forest Classifier (ML model) and the metrics module used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# The forest is seeded as well: without random_state every run grows different
# trees and reports a slightly different accuracy.
rfc = RandomForestClassifier(n_estimators=600, random_state=101)
# Fit on the training data, then predict the held-out test rows.
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)
# Share of test rows whose predicted class matches the true class.
print("Random Forest Model accuracy(in %):", metrics.accuracy_score(y_test, predictions)*100)

**Predicting the Borough (Location)**

*   Predicting the most unsafe borough
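*   The model has an accuracy of 100% (note: `PRECINCT` is kept among the features, and a precinct most likely identifies its borough on its own, so this score probably reflects a lookup rather than a genuine prediction)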



In [None]:
# train_test_split is used by the next cell to split the data into train and test sets.
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target and the raw description, date,
# time and coordinate fields.
# NOTE(review): 'PRECINCT' is not excluded, so it stays in X. If every precinct
# lies in exactly one borough, the target can be read straight off this feature
# (target leakage) - confirm before interpreting the accuracy.
excluded_columns = ['BORO','LOCATION_DESC','OCCUR_DATE','OCCUR_TIME','X_COORD_CD','Y_COORD_CD','Latitude','Longitude','Lon_Lat']
X = df.drop(columns=excluded_columns)
# Target variable: the borough column.
y = df['BORO']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
# Random Forest Classifier (ML model) and the metrics module used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# The forest is seeded as well: without random_state every run grows different
# trees and the reported accuracy is not reproducible.
rfc = RandomForestClassifier(n_estimators=600, random_state=101)
# Fit on the training data, then predict the held-out test rows.
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)
# Share of test rows whose predicted class matches the true class.
print("Random Forest Model accuracy(in %):", metrics.accuracy_score(y_test, predictions)*100)

**Predicting perpetrator's age group**

*   Predicting the most likely age group of the suspect
*   The model has an accuracy percentage of 41.7%



In [None]:
# train_test_split is used by the next cell to split the data into train and test sets.
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target and the raw description, date,
# time and coordinate fields.
excluded_columns = ['PERP_AGE_GROUP','LOCATION_DESC','OCCUR_DATE','OCCUR_TIME','X_COORD_CD','Y_COORD_CD','Latitude','Longitude','Lon_Lat']
X = df.drop(columns=excluded_columns)
# Target variable: the perpetrator age group column.
y = df['PERP_AGE_GROUP']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
# Random Forest Classifier (ML model) and the metrics module used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# The forest is seeded as well: without random_state every run grows different
# trees and reports a slightly different accuracy.
rfc = RandomForestClassifier(n_estimators=600, random_state=101)
# Fit on the training data, then predict the target for the test set (X_test).
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)
# Share of test rows whose predicted class matches the true class.
print("Random Forest Model accuracy(in %):", metrics.accuracy_score(y_test, predictions)*100)

**Predicting Victim's gender**


*   Predicting the most unsafe gender in New York
*   The model has an accuracy of 89.1%


In [None]:
# train_test_split is used by the next cell to split the data into train and test sets.
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target and the raw description, date,
# time and coordinate fields.
excluded_columns = ['VIC_SEX','LOCATION_DESC','OCCUR_DATE','OCCUR_TIME','X_COORD_CD','Y_COORD_CD','Latitude','Longitude','Lon_Lat']
X = df.drop(columns=excluded_columns)
# Target variable: the victim sex column.
y = df['VIC_SEX']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
# Random Forest Classifier (ML model) and the metrics module used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# The forest is seeded as well: without random_state every run grows different
# trees and reports a slightly different accuracy.
rfc = RandomForestClassifier(n_estimators=600, random_state=101)
# Fit on the training data, then predict the held-out test rows.
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)
# Share of test rows whose predicted class matches the true class.
print("Random Forest Model accuracy(in %):", metrics.accuracy_score(y_test, predictions)*100)

# **KNN**

In [None]:
# Keep only the four columns used by the models in this section and display them.
knn_columns = ['INCIDENT_KEY','PRECINCT','JURISDICTION_CODE','X_COORD_CD']
mydata = df[knn_columns]
mydata

In [None]:
# Check the data type of each selected column.
mydata.dtypes

In [None]:
# Features: the first three columns of mydata, selected by name so the choice does
# not depend on column order.
X = mydata[['INCIDENT_KEY', 'PRECINCT', 'JURISDICTION_CODE']]
# Target: 'X_COORD_CD' as a 1-D Series. A one-column DataFrame makes scikit-learn
# warn that a column-vector y was passed where a 1-D array was expected.
# NOTE(review): the classifiers below treat every distinct coordinate value as a
# class label - confirm that a classifier (rather than a regressor) is intended.
y = mydata['X_COORD_CD']

In [None]:
# Display the feature values held in X.
X

In [None]:
# Display the target values held in y.
y

**KNN: n=3**

*   For n=3, we got a score of about 52.39%, which is low for a predictive model
*   We need to consider the method with the highest score to get higher accuracy for the dataset




In [None]:
# Check the data type of the target before fitting the classifier.
y.dtypes

In [None]:
# Create a KNeighborsClassifier with 3 neighbors and fit it on the features X and target y.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X, y)

In [None]:
# Print the value predicted by the KNeighborsClassifier model for one hand-written
# sample (INCIDENT_KEY, PRECINCT, JURISDICTION_CODE). The sample is wrapped in a
# DataFrame carrying the training column names, which avoids scikit-learn's
# "X does not have valid feature names" warning.
sample = pd.DataFrame([[201575314, 103, 0.0]], columns=X.columns)
print(KNN.predict(sample))

In [None]:
# Score the model on the same X and y it was fitted on and print the result.
# NOTE(review): this is training-set performance, not a held-out estimate.
KNN_score = KNN.score(X, y)
print(KNN_score)

**KNN:n=4**
* Here we have taken the n=4 to check if we get a greater score
* The score of KNN when n=4 is around 43.7%, which is lower than the previous score
* When we compare both KNN scores, we choose n=3 over n=4 as it gives the higher score



In [None]:
# Create a KNeighborsClassifier with 4 neighbors and fit it on the features X and target y.
KNN = KNeighborsClassifier(n_neighbors=4)
KNN.fit(X, y)

In [None]:
# Print the value predicted by the KNeighborsClassifier model for one hand-written
# sample (INCIDENT_KEY, PRECINCT, JURISDICTION_CODE). The sample is wrapped in a
# DataFrame carrying the training column names, which avoids scikit-learn's
# "X does not have valid feature names" warning.
# NOTE(review): the INCIDENT_KEY here (201755314) differs from the 201575314 used
# for the n=3 and decision-tree predictions - possibly transposed digits; confirm.
sample = pd.DataFrame([[201755314, 103, 0.0]], columns=X.columns)
print(KNN.predict(sample))

In [None]:
# Score the model on the same X and y it was fitted on and print the result.
# NOTE(review): this is training-set performance, not a held-out estimate.
KNN_score = KNN.score(X, y)
print(KNN_score)

# **DECISION TREE**



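*   When we applied the decision tree model on our dataset the score came up to 98.8%.

*   The score is almost close to 100%, so this method is the most desirable of all and we can eliminate the cost and effort by using this model on top of other models (in our case KNN). Note that this score, like the KNN scores, is computed on the same data the model was fitted on, so it shows how well the model reproduces the training data rather than how well it generalises to new incidents.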

In [None]:
# loading the library
from sklearn import tree

In [None]:
# Create a DecisionTreeClassifier object with its default settings.
dtree = tree.DecisionTreeClassifier()

In [None]:
# Fit the decision tree on the features X and target y.
tree_model = dtree.fit(X, y)

In [None]:
# Print the value predicted by the Decision Tree model for one hand-written sample
# (INCIDENT_KEY, PRECINCT, JURISDICTION_CODE). The sample is wrapped in a DataFrame
# carrying the training column names, which avoids scikit-learn's
# "X does not have valid feature names" warning.
sample = pd.DataFrame([[201575314, 103, 0.0]], columns=X.columns)
print(tree_model.predict(sample))

In [None]:
# Score the decision tree on the same X and y it was fitted on and print the result.
# NOTE(review): this is training-set performance, not a held-out estimate.
DT_score = dtree.score(X, y)
print(DT_score)