# In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the id mapping and the per-user feature table, then attach the coded id
# to every feature row (left join: every feature row is kept).
coded_id = pd.read_csv("data/Social_spammers_dataset/users/coded_ids.csv")
features = pd.read_csv("data/Social_spammers_dataset/users_features/features.csv")
users = features.merge(coded_id, on='user_id', how='left')

# Join the labelled id lists onto the user table. The right join keeps every
# row of the label file, even when no feature row matches its coded_id.
train = users.merge(
    pd.read_csv("data/Social_spammers_dataset/users/coded_ids_labels_train.csv"),
    on='coded_id',
    how='right',
)
test = users.merge(
    pd.read_csv("data/Social_spammers_dataset/users/coded_ids_labels_test.csv"),
    on='coded_id',
    how='right',
)

# Notebook-style size checks (only displayed in an interactive session).
train.shape

test.shape

# Preprocessing: clean the training frame, apply the SAME column selection to
# the test frame, split off a validation set, and standardise every split with
# statistics learned from the training split only.

# unwanted category columns like time and IDs
unwanted = ["default_profile","default_profile_image","avg_intertweet_times","date_newest_tweet","lang","min_intertweet_times","std_nb_symbols_per_tweet","std_nb_symbols_per_word_in_the_tweet","date_oldest_tweet","max_intertweet_times","max_nb_symbols_per_tweet","max_nb_symbols_per_word_in_the_tweet","std_intertweet_times","user_id","coded_id"]

#cleaning train dataset

# Remove every column that contains at least one missing value. No row-wise
# dropna is needed afterwards: no missing values remain in the frame.
nacolumns = train.columns[train.isna().any()].tolist()
train = train.drop(columns=nacolumns)

# errors="ignore": an unwanted column may already have been removed above
# because it contained NaNs; a strict drop would then raise KeyError.
train = train.drop(columns=unwanted, errors="ignore")

# Remove feature columns that are zero for every training row. The label is
# excluded from the check so it can never be dropped here.
train_features = train.drop(columns="label")
allzero_cols = train_features.columns[(train_features == 0).all()].tolist()
train = train.drop(columns=allzero_cols)

# Every remaining non-label column is a model feature.
# NOTE(review): the previous code took the first 124 columns by position;
# selecting by name assumes "label" is the only non-feature column left.
feature_cols = [col for col in train.columns if col != "label"]

#cleaning test dataset

# Reuse the training column selection instead of re-deriving it from the test
# data, so both frames share one feature layout. Incomplete rows are dropped
# because the column set is already fixed by the training data.
test_cols = feature_cols + (["label"] if "label" in test.columns else [])
test = test[test_cols].dropna(axis=0)
# Keep the held-out labels (if present) before `test` becomes a plain array.
test_labels = test["label"] if "label" in test.columns else None

#split data into x features and labels
x = train[feature_cols]
y = train["label"]

#train test (validation) split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.15, random_state=1
)

#normalizing the numerical features
# Fit on the training split only; validation and test data are transformed
# with those statistics so nothing about them leaks into training.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)
x = scale.transform(x)
test = scale.transform(test[feature_cols])

x.shape

y.value_counts()

#function for performing grid search

def grid_search(model, params, x=None, y=None, n_splits=10):
    """Run a stratified k-fold grid search and print the best CV score.

    Parameters
    ----------
    model : estimator
        Scikit-learn estimator to tune.
    params : dict or list of dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    x, y : array-like, optional
        Training data and labels. When omitted, the module-level ``x_train``
        and ``y_train`` are used.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can read ``best_params_`` or
        reuse ``best_estimator_`` instead of losing the result.
    """
    if x is None:
        x = x_train
    if y is None:
        y = y_train

    skf = StratifiedKFold(n_splits=n_splits)
    gs = GridSearchCV(model, cv=skf, param_grid=params, return_train_score=True)
    gs.fit(x, y)
    # best_score_ is the mean cross-validated score of the best candidate,
    # using the estimator's default scorer (accuracy for classifiers).
    print("Accuracy =",gs.best_score_)
    print("\n")
    return gs


from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings: fit on the training split and
# report accuracy on both the training and the validation split.
model = GaussianNB()

model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('-----naive_bayes------')
print('Model train accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Model test accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)))
print("\n")

# The confusion matrix compares the TRUE validation labels with the
# predictions; comparing predictions with themselves is always diagonal.
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.title("Naive Bayes")
plt.show()


#KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings: fit on the training split and
# report accuracy on both the training and the validation split.
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('-----KNeighborsClassifier------')
print('Model train accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Model test accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)))
print("\n")

# The confusion matrix compares the TRUE validation labels with the
# predictions; comparing predictions with themselves is always diagonal.
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.title("KNeighborsClassifier")
plt.show()

#Support Vector Machine
from sklearn.svm import SVC

# Baseline SVC with default hyper-parameters, fitted on the training split.
model = SVC()
model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# Candidate regularisation strengths and kernels for the cross-validated search.
params = {"C":[0.1,1,10],
          "kernel":["linear", "poly", "rbf", "sigmoid"]}
print('-----SVC------')
grid_search(model,params)

# NOTE: the matrix below describes the default-parameter model fitted above,
# not the best candidate found by the grid search.
# It compares the TRUE validation labels with the predictions; comparing
# predictions with themselves is always diagonal.
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.title("Support Vector Machine")
plt.show()


#decision tree
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters, fitted on the
# training split.
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# Search space: split criterion, tree depth and minimum node sizes.
params = {"criterion":["gini","entropy"],
          "max_depth":range(1,10),
          "min_samples_split":range(2,10),
          "min_samples_leaf":range(1,5)}
print('-----DecisionTreeClassifier------')
grid_search(model,params)

# NOTE: the matrix below describes the default-parameter model fitted above,
# not the best candidate found by the grid search.
# It compares the TRUE validation labels with the predictions; comparing
# predictions with themselves is always diagonal.
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.title("DecisionTreeClassifier")
plt.show()

#random forest
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters, fitted on the
# training split.
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated grid candidates) and newer
# scikit-learn releases reject it.
params = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
print('-----RandomForestClassifier------')
grid_search(model,params)

# NOTE: the matrix below describes the default-parameter model fitted above,
# not the best candidate found by the grid search.
# It compares the TRUE validation labels with the predictions; comparing
# predictions with themselves is always diagonal.
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.title("RandomForestClassifier")
plt.show()

from matplotlib import pyplot as plt

# Summary bar chart comparing the five classifiers.
# NOTE(review): the scores are hard-coded and labelled as F1, while the code
# visible in this file only prints accuracy -- confirm where they come from.
barWidth = 0.25

# set height of bar (one score per model, same order as the tick labels)
bars1 = [0.84,0.89,0.95,0.95,0.96]
model_names = ['Naive Bayes', 'KNN', 'SVM', 'Dec. Tree', 'R. Forest']

# Set position of bar on X axis
r1 = np.arange(len(bars1))

# Make the plot
plt.bar(r1, bars1, width=barWidth, edgecolor='white')
plt.ylabel('F1 Scores', fontweight='bold')

# plt.bar centres each bar on its x position, so the tick for a bar belongs at
# that same position (an offset would shift every label off its bar).
plt.xticks(r1, model_names)

# No legend: there is a single unlabelled series, so a legend would be empty.
plt.show()