# Shardul Dabhane

# Problem 3: Changes in Kaggle Code for KNN

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

# Kaggle setup: File -> Add or upload data -> search for "credit card".
# The dataset is expected under ../input/creditcard; edit the path below
# if your copy of the CSV lives in a different folder.
data = pd.read_csv("../input/creditcard/creditcard.csv")

# Print (rows, columns), then show summary statistics as the cell output.
print(data.shape)
data.describe()

In [2]:
# Check whether any value in the frame is missing. The result is printed
# explicitly: a bare expression that is not the last line of a cell is
# evaluated and then discarded, so the check would otherwise never be shown.
print("Any missing values:", data.isnull().any().any())

# Convert the 'Class' label column to boolean dtype.
data['Class'] = data['Class'].astype('bool')

In [3]:
# Count each class by its label value instead of by position in value_counts().
# value_counts() orders its result by frequency, so position 0 is the most
# common class (not necessarily class 0), and position 1 does not exist at all
# when only one class is present.
class_zero = (data['Class'] == 0).sum()
class_one = (data['Class'] == 1).sum()
print(data["Class"].value_counts())

In [4]:
# Bar chart of how many rows fall in each class (labels on x, counts on y).
sb.barplot(
    x=data["Class"].value_counts().index.values,
    y=data["Class"].value_counts().values,
)
plt.title("Class distribution")

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Standardize 'Amount' and store the result as a new 'AmountNormalized' column.
# StandardScaler expects a 2-D input, hence the reshape to a single column.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split is made, so test-set statistics influence this feature.
data['AmountNormalized'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
)

# Show summary statistics of the standardized column as the cell output.
data['AmountNormalized'].describe()

In [6]:
# Feature matrix: every column except the label and the raw 'Amount'.
# 'AmountNormalized' already carries the amount on a standardized scale; keeping
# the raw column as well lets its much larger values dominate the distance
# computations that KNN relies on.
# NOTE(review): any other column that is not on a comparable scale would have
# the same effect -- confirm the remaining features before trusting the scores.
X = data.drop(columns=['Class', 'Amount']).values

# Label column kept as a 2-D (n, 1) array; the model cells flatten it with
# .ravel() before fitting.
y = data.loc[:, data.columns == 'Class'].values

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [7]:
def plot_precision_recall_curve(y_actual, y_score, model_name):
    """Draw the precision-recall curve for one model with matplotlib's pyplot API.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels, forwarded to ``metrics.precision_recall_curve``.
    y_score : array-like
        Scores for the positive class, forwarded to
        ``metrics.precision_recall_curve``.
    model_name : str
        Name interpolated into the plot title.
    """
    precision, recall, _ = metrics.precision_recall_curve(y_actual, y_score)

    # Step curve plus a light fill underneath it.
    plt.step(recall, precision, color='b', alpha=0.1, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.1, color='b')

    plt.title('Precision Recall Curve for {} Model'.format(model_name))
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # Leave a little headroom past 1.0 on both axes so that segments lying at
    # recall == 1 or precision == 1 are not drawn on top of the axes frame.
    plt.xlim([0, 1.05])
    plt.ylim([0, 1.05])

def evaluate_model(y_actual, y_pred, y_score, model_name):
    """Print evaluation metrics for a classifier and plot its precision-recall curve.

    Prints, in order: the confusion matrix, the classification report
    (six decimal digits) and the area under the ROC curve, each preceded by a
    header naming the model. Finally calls ``plot_precision_recall_curve``.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, used for the confusion matrix and the report.
    y_score : array-like
        Scores for the positive class, used for ROC AUC and the PR curve.
    model_name : str
        Name interpolated into every printed header and the plot title.
    """
    cm = metrics.confusion_matrix(y_actual, y_pred)
    print('Confusion Matrix for {} Model'.format(model_name))
    print(cm)
    print('Classification Report for {} Model'.format(model_name))
    print(metrics.classification_report(y_actual, y_pred, digits=6))
    # Header previously read "Area under under ROC curve" (duplicated word).
    print('Area under ROC curve for {} Model'.format(model_name))
    print(metrics.roc_auc_score(y_actual, y_score))
    plot_precision_recall_curve(y_actual, y_score, model_name)

In [8]:
# K-nearest neighbours: k=5, Minkowski distance with p=2.
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(
    X_train, y_train.ravel()
)

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# Column 1 of the probability matrix is used as the score.
evaluate_model(
    y_actual=y_test,
    y_pred=y_pred_knn,
    y_score=y_prob_knn[:, [1]],
    model_name='KNN (n=5)',
)

In [9]:
# K-nearest neighbours: k=10, Minkowski distance with p=2.
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2).fit(
    X_train, y_train.ravel()
)

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# Column 1 of the probability matrix is used as the score.
evaluate_model(
    y_actual=y_test,
    y_pred=y_pred_knn,
    y_score=y_prob_knn[:, [1]],
    model_name='KNN (n=10)',
)

In [10]:
# K-nearest neighbours: k=50, Minkowski distance with p=2.
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=50, metric='minkowski', p=2).fit(
    X_train, y_train.ravel()
)

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# Column 1 of the probability matrix is used as the score.
evaluate_model(
    y_actual=y_test,
    y_pred=y_pred_knn,
    y_score=y_prob_knn[:, [1]],
    model_name='KNN (n=50)',
)

In [11]:
# K-nearest neighbours: k=100, Minkowski distance with p=2.
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=100, metric='minkowski', p=2).fit(
    X_train, y_train.ravel()
)

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# Column 1 of the probability matrix is used as the score.
evaluate_model(
    y_actual=y_test,
    y_pred=y_pred_knn,
    y_score=y_prob_knn[:, [1]],
    model_name='KNN (n=100)',
)

In [12]:
# KNN with k=5 and metric='euclidean'
# (Euclidean distance is the same distance as Minkowski with p=2.)
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=5, euclidean)')

In [13]:
# KNN with k=10 and metric='euclidean'
# (Euclidean distance is the same distance as Minkowski with p=2.)
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=10, euclidean)')

In [14]:
# KNN with k=50 and metric='euclidean'
# (Euclidean distance is the same distance as Minkowski with p=2.)
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=50, metric='euclidean')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=50, euclidean)')

In [15]:
# KNN with k=100 and metric='euclidean'
# (Euclidean distance is the same distance as Minkowski with p=2.)
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=100, metric='euclidean')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=100, euclidean)')

In [16]:
# KNN with k=5 and metric='manhattan'
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=5, manhattan)')

In [17]:
# KNN with k=10 and metric='manhattan'
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=10, metric='manhattan')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=10, manhattan)')

In [18]:
# KNN with k=50 and metric='manhattan'
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=50, metric='manhattan')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=50, manhattan)')

In [19]:
# KNN with k=100 and metric='manhattan'
from sklearn.neighbors import KNeighborsClassifier

# Train on the training split; labels are flattened to 1-D first.
knn = KNeighborsClassifier(n_neighbors=100, metric='manhattan')
knn.fit(X_train, y_train.ravel())

# Predict hard labels and per-class probabilities for the test split.
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# The label names the metric so this report and plot can be told apart from
# runs that use the same k with a different distance metric.
evaluate_model(y_test, y_pred_knn, y_prob_knn[:, [1]], 'KNN (n=100, manhattan)')

# Question 4: Link Analysis and PageRank algorithm implementation

In [20]:
#Pagerank implementation. 

#use numpy module
import numpy as np

# Define the function for page rank algorithm
def page_rank(transition_matrix, tol=0.01, max_iter=10000):
    """Estimate page ranks by power iteration from a uniform start vector.

    Starting from the uniform vector ``1/n``, the rank vector is repeatedly
    multiplied by ``transition_matrix`` until two consecutive iterates differ
    by less than ``tol`` in Euclidean norm.

    Parameters
    ----------
    transition_matrix : array-like, shape (n, n)
        Square link matrix. Presumably column-stochastic (each column sums
        to 1), as in the example this notebook uses -- this is not checked.
    tol : float, optional
        Convergence threshold on the Euclidean norm of the change between
        consecutive iterates. Defaults to 0.01.
    max_iter : int, optional
        Maximum number of multiplications before giving up. Defaults to 10000.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The first iterate whose distance from its predecessor is below ``tol``.

    Raises
    ------
    ValueError
        If ``transition_matrix`` is not a non-empty square 2-D matrix.
    RuntimeError
        If the iteration has not converged after ``max_iter`` steps (for
        example on a periodic chain, which would otherwise loop forever).
    """
    matrix = np.asarray(transition_matrix, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1] or matrix.shape[0] == 0:
        raise ValueError(
            "transition_matrix must be a non-empty square 2-D matrix, got shape {}".format(matrix.shape)
        )

    # Uniform initial ranks, sized from the matrix instead of a fixed 6.
    n_pages = matrix.shape[0]
    rank_matrix = np.ones(n_pages) / n_pages

    for _ in range(max_iter):
        dot_product = matrix.dot(rank_matrix)
        # The norm already ignores sign, so no abs() is needed on the difference.
        if np.linalg.norm(rank_matrix - dot_product) < tol:
            return dot_product
        rank_matrix = dot_product

    raise RuntimeError(
        "page_rank did not converge within {} iterations (tol={})".format(max_iter, tol)
    )

In [23]:
# Transition matrix given in the problem statement: 6 x 6, one row per line,
# with every column summing to 1.
transition_matrix = np.array([
    [0.0, 0.25, 0.0, 0.0, 0.0, 0.5],
    [0.5, 0.0,  0.0, 0.0, 0.0, 0.0],
    [0.0, 0.25, 0.0, 0.0, 0.5, 0.5],
    [0.0, 0.25, 1.0, 0.0, 0.0, 0.0],
    [0.5, 0.0,  0.0, 0.5, 0.0, 0.0],
    [0.0, 0.25, 0.0, 0.5, 0.5, 0.0],
])

# Run the power iteration and show the resulting rank vector as the cell output.
ranks_of_pages = page_rank(transition_matrix)
ranks_of_pages

**Page ranks of the web pages after running the PageRank algorithm, from highest to lowest:**

**1. D(0.22591146)**

**2. F(0.21028646)**

**3. C(0.20898438)**

**4. E(0.17057292)**

**5. A(0.12304687)**

**6. B(0.06119792)**

In [22]:
# Defining the function for page rank algorithm with different distribution

def page_rank_scaled(transition_matrix, scale=100, tol=0.001, max_iter=10000):
    """Estimate page ranks by power iteration from a scaled uniform start vector.

    Same iteration as an ordinary power-iteration PageRank, but the initial
    vector is ``scale / n`` in every entry instead of ``1 / n``, so the
    resulting ranks are expressed on that scale.

    Parameters
    ----------
    transition_matrix : array-like, shape (n, n)
        Square link matrix. Presumably column-stochastic (each column sums
        to 1), as in the example this notebook uses -- this is not checked.
    scale : float, optional
        Total mass distributed uniformly over the initial ranks.
        Defaults to 100.
    tol : float, optional
        Convergence threshold on the Euclidean norm of the change between
        consecutive iterates. Defaults to 0.001.
    max_iter : int, optional
        Maximum number of multiplications before giving up. Defaults to 10000.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The first iterate whose distance from its predecessor is below ``tol``.

    Raises
    ------
    ValueError
        If ``transition_matrix`` is not a non-empty square 2-D matrix.
    RuntimeError
        If the iteration has not converged after ``max_iter`` steps (for
        example on a periodic chain, which would otherwise loop forever).
    """
    matrix = np.asarray(transition_matrix, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1] or matrix.shape[0] == 0:
        raise ValueError(
            "transition_matrix must be a non-empty square 2-D matrix, got shape {}".format(matrix.shape)
        )

    # Scaled uniform initial ranks, sized from the matrix instead of a fixed 6.
    n_pages = matrix.shape[0]
    rank_matrix = scale * np.ones(n_pages) / n_pages

    for _ in range(max_iter):
        dot_product = matrix.dot(rank_matrix)
        # The norm already ignores sign, so no abs() is needed on the difference.
        if np.linalg.norm(rank_matrix - dot_product) < tol:
            return dot_product
        rank_matrix = dot_product

    raise RuntimeError(
        "page_rank_scaled did not converge within {} iterations (tol={})".format(max_iter, tol)
    )

In [24]:
# Re-run the ranking with the scaled start vector (reuses the transition_matrix
# defined in the earlier cell) and show the result as the cell output.
ranks_of_pages = page_rank_scaled(transition_matrix)
ranks_of_pages

**Conclusion: The rank value of each page scales with the scaling of the
initial rank vector, but the ordering of the pages stays the same: D has the
highest rank, followed by F, C, E, A, and B in decreasing order.**

References:

[1] Introduction to Data Mining, 2nd Edition, by Tan, Steinbach, Karpatne, and Kumar

[2] https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

[3] https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

[4] https://notebook.community/hktxt/MachineLearning/PageRank

[5] https://www.geeksforgeeks.org/page-rank-algorithm-implementation