In [None]:
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

from google.colab import files as colab_files
uploaded = colab_files.upload()

Saving amazonNetwork.csv to amazonNetwork (2).csv
Saving reviewTest.csv to reviewTest (2).csv
Saving reviewTrain.csv to reviewTrain (2).csv


In [None]:
# Read the co-purchase edge list and the train/test review tables (paths are relative).
amazonNetwork, reviewTrain, reviewTest = map(
    pd.read_csv,
    ('amazonNetwork.csv', 'reviewTrain.csv', 'reviewTest.csv'),
)

**2.1 Part 1: Exploratory Social Network Analysis**

In [None]:
# Preview the first three rows of the Amazon edge list (columns FromNodeId and ToNodeId).
amazonNetwork.head(3)

Unnamed: 0,FromNodeId,ToNodeId
0,1,2
1,1,4
2,1,5


In [None]:
# Build a directed graph with one edge per (FromNodeId, ToNodeId) row of the edge list.
G = nx.from_pandas_edgelist(
    amazonNetwork,
    source='FromNodeId',
    target='ToNodeId',
    create_using=nx.DiGraph(),
)

Question b. How many items are present in the network and how many co-purchases happened?

In [None]:
# Each node is one item, so the node count is the number of items in the network.
item_count = G.number_of_nodes()
# Each edge is one co-purchase link, so the edge count is the number of co-purchases.
co_purchase_count = G.number_of_edges()

print('The items are present in the network is ', item_count)
print('The edges are present in the network is ', co_purchase_count)

The items are present in the network is  2647
The edges are present in the network is  10841


Question c. Compute the average shortest distance between the nodes in graph G. 
Explain your results briefly.

In [None]:
# Average shortest path length of G as computed by NetworkX. The value is printed by
# this cell rather than quoted here: the recorded run printed about 9.59, not the 6.77
# that this comment previously cited.
print('The average shortest path length is ', nx.average_shortest_path_length(G))

The average shortest path length is  9.592795477759587


Question d. Compute the transitivity and the average clustering coefficient of the network graph G. Explain your findings briefly based on the definitions of clustering coefficient and transitivity.

In [None]:
# Compute both cohesion measures first, then report them.
graph_transitivity = nx.transitivity(G)
mean_clustering = nx.average_clustering(G)

print('The transitivity of the graph G is ', graph_transitivity)
print('The average clustering coefficient of the graph G is ', mean_clustering)

The transitivity of the graph G is  0.4339169154480595
The average clustering coefficient of the graph G is  0.4086089178720651


Question e. Apply the PageRank algorithm to network G with damping value 0.5 and find the 10 nodes with the highest PageRank. Explain your findings briefly.
NetworkX document of the PageRank algorithm: https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html

In [None]:
# PageRank score per node, using a damping value of 0.5.
pageRanking = nx.pagerank(G, alpha=0.5)
# Same scores as a dict whose insertion order runs from highest to lowest score.
pageranks = dict(sorted(pageRanking.items(), key=operator.itemgetter(1), reverse=True))

In [None]:
# pageranks is ordered by descending score, so its first ten keys are the top-ranked nodes.
highest_pagerank_nodes = list(pageranks)[:10]
for count, node in enumerate(highest_pagerank_nodes, start=1):
    print(f"{count}. {node}")

1. 8
2. 481
3. 33
4. 18
5. 23
6. 30
7. 346
8. 99
9. 93
10. 21


> The higher an item's PageRank, the more likely it is to be reached through co-purchase links. The ten items listed above are therefore the ones with the highest chance of being co-purchased.



# Feature Engineering

In [None]:
# Report how many distinct item IDs appear in the reviewTrain data frame.
print(f'There are {reviewTrain.id.nunique()} unique item ID in the data frame of reviewTrain.')

There are 1674 unique item ID in the data frame of reviewTrain.


In [None]:
# Item IDs that occur in the training reviews.
reviewTrain_IDs = set(reviewTrain.id)
# Item IDs that occur in the network as either endpoint of an edge.
amazonNetwork_IDs = set(amazonNetwork.FromNodeId) | set(amazonNetwork.ToNodeId)
# Training IDs that have no matching node in the network.
IDs_gap = reviewTrain_IDs.difference(amazonNetwork_IDs)

print(f'The gap between the number of the training set of reviews and the number of Amazon network is {len(IDs_gap)}.')

The gap between the number of the training set of reviews and the number of Amazon network is 21.


> Based on the above calculation, 21 item IDs in reviewTrain do not appear among the Amazon network's node IDs.



Because the original data has a network structure, the features should draw on network theory, namely the following measures:
1.   clustering coefficient
2.   degree centrality
3.   closeness centrality
4.   betweenness centrality

These network measures will be added to my data frame as features.

In [None]:
# Per-node structural measures of G; each result is keyed by node and is turned into a
# feature column in the next cell.
clustering = nx.clustering(G)
degree_centrality = nx.degree_centrality(G)
# NOTE(review): closeness and betweenness are computed on the directed graph; how edge
# direction is treated depends on the NetworkX version — confirm against its docs.
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

In [None]:
def _score_frame(scores, column_name):
    """Wrap a node -> score dict as a one-column DataFrame indexed by node."""
    return pd.DataFrame.from_dict(scores, orient='index', columns=[column_name])


# One single-column frame per network measure, indexed by node ID for merging on `id`.
page_ranking_column = _score_frame(pageranks, 'page_rank')
clustering_column = _score_frame(clustering, 'clustering')
degree_centrality_column = _score_frame(degree_centrality, 'degree_centrality')
closeness_centrality_column = _score_frame(closeness_centrality, 'closeness_centrality')
betweenness_centrality_column = _score_frame(betweenness_centrality, 'between_centrality')

In [None]:
# Some training IDs are absent from the network, so each feature is attached with a
# left merge on `id`: every review row is kept and unmatched items get missing values.
train_feature_columns = (
    page_ranking_column,
    clustering_column,
    degree_centrality_column,
    closeness_centrality_column,
    betweenness_centrality_column,
)
for feature_column in train_feature_columns:
    reviewTrain = reviewTrain.merge(feature_column, left_on='id', right_index=True, how='left')

> In case the steps above introduce missing values that could influence the final result, I decided to drop all rows with missing values.

In [None]:
# Drop every row that has a missing value in any column; reviewTrain is modified in place,
# so re-running earlier cells is required to recover the dropped rows.
reviewTrain.dropna(inplace=True)

In [None]:
# The group column holds string labels, so it must be converted to numbers before modelling;
# list its distinct values first.
reviewTrain['group'].unique()

array([' Book', ' Music', ' DVD', ' Video', ' Toy'], dtype=object)

In [None]:
# Replace each distinct group string with an integer code so the classifiers can use it.
# The fitted encoder is kept in `groupencoder` because the test set is encoded (and later
# decoded) with the same mapping.
groupencoder = LabelEncoder()
reviewTrain['group'] = groupencoder.fit_transform(reviewTrain['group'])

> The column **group** could be useful since it entails five unique values.



In [None]:
# Pairwise correlation of the numeric columns only. reviewTrain still contains the text
# column `title`; pandas 2.0+ no longer drops non-numeric columns silently in corr() and
# raises instead, so the numeric columns are selected explicitly (works on older pandas too).
reviewTrain.select_dtypes(include='number').corr()

Unnamed: 0,id,group,review,page_rank,clustering,degree_centrality,closeness_centrality,between_centrality
id,1.0,-0.003071,0.033189,-0.356874,0.190649,-0.47567,-0.437823,-0.327841
group,-0.003071,1.0,0.140767,-0.005651,0.001618,0.001436,0.003376,-0.016622
review,0.033189,0.140767,1.0,-0.023928,0.023958,-0.024548,-0.015654,-0.002024
page_rank,-0.356874,-0.005651,-0.023928,1.0,-0.171188,0.888381,0.436432,0.576272
clustering,0.190649,0.001618,0.023958,-0.171188,1.0,-0.119005,-0.166597,-0.276858
degree_centrality,-0.47567,0.001436,-0.024548,0.888381,-0.119005,1.0,0.388936,0.528611
closeness_centrality,-0.437823,0.003376,-0.015654,0.436432,-0.166597,0.388936,1.0,0.381034
between_centrality,-0.327841,-0.016622,-0.002024,0.576272,-0.276858,0.528611,0.381034,1.0


> Based on the pairwise correlations computed on the training set, the columns group, page_rank, clustering, degree_centrality and closeness_centrality show the largest (though still weak) correlations with review, while between_centrality shows almost none (-0.002).



In [None]:
# Model inputs: everything except the target (`review`), the free-text `title`, and
# `between_centrality`. What remains is `id`, `group`, `page_rank`, `clustering`,
# `degree_centrality` and `closeness_centrality`.
X_train = reviewTrain.drop(columns=['review', 'title', 'between_centrality'])

> Based on the analysis of the correlation of the training set of reviews, the columns group, page_rank, clustering, degree_centrality, closeness_centrality should be under consideration.





In [None]:
# Target: the review ratings encoded as integer class labels. The fitted encoder is kept
# in `reviewencoder` so predictions can be mapped back to rating values later.
reviewencoder = LabelEncoder()
y_train = reviewencoder.fit_transform(reviewTrain['review'])

# Classification Experiments

A voting classifier can help me make my classification work reasonably. Five classifier types are included here: K-nearest Neighbors Vote Classifier, Gaussian Process Classifier, Decision Tree Classifier, Random Forest Classifier, Multi-layer Perceptron Classifier. 

Although the AdaBoost Classifier and Logistic Regression were also tried in experiments, they did not perform well, so they are not in the final plan. Each of the classifiers mentioned above is tuned with GridSearchCV to find its optimal parameters.

In [None]:
# First candidate set for the voting ensemble, as (name, estimator) pairs.
# Every estimator starts from its default settings; tuning happens in the grid-search cell.
classifiers = [
    ("Logistic Regression",  LogisticRegression()),
    ("KNN", KNeighborsClassifier()), 
    ("Gaussian Process", GaussianProcessClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()), 
    ("Multi-layer Perceptron", MLPClassifier())
]

In [None]:
# One search grid per classifier. The order of `grid_param` must match the order of
# `classifiers`, because the tuning loop pairs them positionally.
logistic_grid = {
    'fit_intercept': [True, False],
    'solver': ['newton-cg', 'liblinear'],
    'random_state': [42],
    'max_iter': [1000],
}
knn_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
gaussian_process_grid = {
    'max_iter_predict': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'random_state': [42],
}
decision_tree_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
random_forest_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
mlp_grid = {
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 1, 10],
    'random_state': [42],
    'max_iter': [1000],
}

# Each entry is a one-element list of dicts, the form passed to GridSearchCV's param_grid.
grid_param = [
    [logistic_grid],
    [knn_grid],
    [gaussian_process_grid],
    [decision_tree_grid],
    [random_forest_grid],
    [mlp_grid],
]

In [None]:
# Tune each classifier on its own grid (5-fold CV, scored by negated MAE) and write the
# best settings back onto the estimator object stored in `classifiers`.
for (_, estimator), param_settings in zip(classifiers, grid_param):
    best_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_settings,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    best_search.fit(X_train, y_train)

    best_param = best_search.best_params_
    estimator.set_params(**best_param)



In [None]:
# Fit a hard-voting and a soft-voting ensemble over the tuned classifiers and compare them.
# Note that the MAE here is measured between encoded class labels (y_train) and is
# evaluated on the same rows the ensembles were fitted on.
hard_vote_clf = VotingClassifier(estimators=classifiers, voting='hard').fit(X_train, y_train)
hard_predictions = hard_vote_clf.predict(X_train)
mae_hard = mean_absolute_error(y_train, hard_predictions)
print("Training MAE by hard voting: %0.2f" % mae_hard)

soft_vote_clf = VotingClassifier(estimators=classifiers, voting='soft').fit(X_train, y_train)
soft_predictions = soft_vote_clf.predict(X_train)
mae_soft = mean_absolute_error(y_train, soft_predictions)
print("Training MAE by soft voting: %0.2f" % mae_soft)

Training MAE by hard voting: 1.85
Training MAE by soft voting: 1.74


In [None]:
# Second candidate set: the first list without Logistic Regression and with AdaBoost added.
# NOTE(review): the original remark "higher MAE, dropped" presumably refers to Logistic
# Regression being removed after the first run — confirm.
# The grid_param list defined for this run has no AdaBoost entry, so the zip-based tuning
# loop leaves AdaBoost at its default settings.
classifiers = [
    ("KNN", KNeighborsClassifier()), 
    ("Gaussian Process", GaussianProcessClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()), 
    ("Multi-layer Perceptron", MLPClassifier()),
    ("AdaBoost", AdaBoostClassifier())
]

In [None]:
# One search grid per tuned classifier, in the same order as the first five entries of
# `classifiers` (the tuning loop pairs them positionally).
knn_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
gaussian_process_grid = {
    'max_iter_predict': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'random_state': [42],
}
decision_tree_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
random_forest_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
mlp_grid = {
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 1, 10],
    'random_state': [42],
    'max_iter': [1000],
}

# Each entry is a one-element list of dicts, the form passed to GridSearchCV's param_grid.
grid_param = [
    [knn_grid],
    [gaussian_process_grid],
    [decision_tree_grid],
    [random_forest_grid],
    [mlp_grid],
]

In [None]:
# Tune each classifier on its own grid (5-fold CV, scored by negated MAE) and write the
# best settings back onto the estimator. zip stops at the shorter list, so a classifier
# without a matching grid entry is not tuned.
for (_, estimator), param_settings in zip(classifiers, grid_param):
    best_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_settings,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    best_search.fit(X_train, y_train)

    best_param = best_search.best_params_
    estimator.set_params(**best_param)



In [None]:
# Fit a hard-voting and a soft-voting ensemble over the current classifiers and compare them.
# The MAE is measured between encoded class labels (y_train), on the training rows.
hard_vote_clf = VotingClassifier(estimators=classifiers, voting='hard').fit(X_train, y_train)
hard_predictions = hard_vote_clf.predict(X_train)
mae_hard = mean_absolute_error(y_train, hard_predictions)
print("Training MAE by hard voting: %0.2f" % mae_hard)

soft_vote_clf = VotingClassifier(estimators=classifiers, voting='soft').fit(X_train, y_train)
soft_predictions = soft_vote_clf.predict(X_train)
mae_soft = mean_absolute_error(y_train, soft_predictions)
print("Training MAE by soft voting: %0.2f" % mae_soft)

Training MAE by hard voting: 1.54
Training MAE by soft voting: 1.67


In [None]:
# Final model: the five classifiers kept for the voting ensemble, as (name, estimator)
# pairs. Logistic Regression and AdaBoost from the earlier runs are not included.
# Each estimator starts from defaults and is tuned by the grid-search cell below.
classifiers = [
    ("KNN", KNeighborsClassifier()), 
    ("Gaussian Process", GaussianProcessClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()), 
    ("Multi-layer Perceptron", MLPClassifier())
]

In [None]:
# One search grid per classifier of the final model, in the same order as `classifiers`
# (the tuning loop pairs them positionally).
knn_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
gaussian_process_grid = {
    'max_iter_predict': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'random_state': [42],
}
decision_tree_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
random_forest_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 40, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, None],
    'random_state': [42],
}
mlp_grid = {
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 1, 10],
    'random_state': [42],
    'max_iter': [1000],
}

# Each entry is a one-element list of dicts, the form passed to GridSearchCV's param_grid.
grid_param = [
    [knn_grid],
    [gaussian_process_grid],
    [decision_tree_grid],
    [random_forest_grid],
    [mlp_grid],
]

In [None]:
# Tune each final-model classifier on its own grid (5-fold CV, scored by negated MAE) and
# write the best settings back onto the estimator object stored in `classifiers`.
for (_, estimator), param_settings in zip(classifiers, grid_param):
    best_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_settings,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    best_search.fit(X_train, y_train)

    best_param = best_search.best_params_
    estimator.set_params(**best_param)



Now I have selected the components of the voting classifier, and tuned the parameters. I still need to choose from hard voting and soft voting in the voting classifier.



In [None]:
# Fit both voting modes over the tuned final classifiers. `hard_vote_clf` is the model
# used for the test-set predictions further down.
# The MAE is measured between encoded class labels (y_train), on the training rows.
hard_vote_clf = VotingClassifier(estimators=classifiers, voting='hard').fit(X_train, y_train)
hard_predictions = hard_vote_clf.predict(X_train)
mae_hard = mean_absolute_error(y_train, hard_predictions)
print("Training MAE by hard voting: %0.2f" % mae_hard)

soft_vote_clf = VotingClassifier(estimators=classifiers, voting='soft').fit(X_train, y_train)
soft_predictions = soft_vote_clf.predict(X_train)
mae_soft = mean_absolute_error(y_train, soft_predictions)
print("Training MAE by soft voting: %0.2f" % mae_soft)

Training MAE by hard voting: 1.47
Training MAE by soft voting: 1.67


Hard voting achieves the lower training MAE, so I use it as my final model.


In [None]:
# Preview the raw test set before the network features are attached.
reviewTest.head()

In [None]:
# Attach the four network features used by the model to the test rows. Left merges on
# `id` keep every test row; betweenness centrality is not merged, as it is not a model input.
test_feature_columns = (
    page_ranking_column,
    clustering_column,
    degree_centrality_column,
    closeness_centrality_column,
)
for feature_column in test_feature_columns:
    reviewTest = reviewTest.merge(feature_column, left_on='id', right_index=True, how='left')

In [None]:
# Replace every missing value in the frame with 0 (in place), so that all test rows can
# be scored. Unlike the training set, no rows are dropped here.
reviewTest.fillna(0, inplace=True)
# Encode group with the encoder that was fitted on the training data.
reviewTest['group'] = groupencoder.transform(reviewTest['group'])
reviewTest.head()

Unnamed: 0,id,title,group,review,page_rank,clustering,degree_centrality,closeness_centrality
0,90,The Eagle Has Landed,0,0.0,0.000347,0.25,0.003779,0.116428
1,1372,Che in Africa: Che Guevara's Congo Diary,0,0.0,0.0003,0.288462,0.003023,0.080232
2,1382,The Darwin Awards II : Unnatural Selection,0,0.0,0.000338,0.75,0.003401,0.063412
3,253,Celtic Glory,2,0.0,0.000268,0.75,0.002268,0.072458
4,671,Sublte Aromatherapy,0,0.0,0.000358,0.5625,0.003401,0.09362


In [None]:
# Predict with exactly the columns, in exactly the order, that the model was trained on.
# Selecting by X_train.columns replaces the earlier "drop review and title" selection,
# which only matched the training features by coincidence of column layout; a missing
# feature now fails loudly with a KeyError.
test_features = reviewTest[X_train.columns]
# Map the predicted class labels back to the original review values.
reviewTest['review'] = reviewencoder.inverse_transform(hard_vote_clf.predict(test_features))
# Restore the readable group names for the exported results.
reviewTest['group'] = groupencoder.inverse_transform(reviewTest['group'])
reviewTest.head()

Unnamed: 0,id,title,group,review,page_rank,clustering,degree_centrality,closeness_centrality
0,90,The Eagle Has Landed,Book,0.0,0.000347,0.25,0.003779,0.116428
1,1372,Che in Africa: Che Guevara's Congo Diary,Book,0.0,0.0003,0.288462,0.003023,0.080232
2,1382,The Darwin Awards II : Unnatural Selection,Book,0.0,0.000338,0.75,0.003401,0.063412
3,253,Celtic Glory,Music,4.5,0.000268,0.75,0.002268,0.072458
4,671,Sublte Aromatherapy,Book,0.0,0.000358,0.5625,0.003401,0.09362


The predicted results are shown above, and I save the results as a csv, which will be uploaded on Canvas.



In [None]:
# Write only the identifying columns and the predicted review to the submission file.
submission_columns = ['id', 'title', 'group', 'review']
reviewTest[submission_columns].to_csv('reviewTestResult.csv', index=False)