In [None]:
# Standard library
import csv
import datetime  # rebound to the datetime class by the from-import below
import sys
from datetime import datetime

# Third-party
import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold, train_test_split


In [None]:
def read_datasets(actual_path="C:\\Users\\SAHASRI\\Downloads\\actual_users.csv",
                  fake_path="C:\\Users\\SAHASRI\\Downloads\\fake_users.csv"):
    """Read the actual and fake user profiles and build one label per row.

    Parameters
    ----------
    actual_path : str or path-like, optional
        CSV file with the actual (genuine) user profiles. Defaults to the
        location the notebook was originally written against.
    fake_path : str or path-like, optional
        CSV file with the fake user profiles. Same default convention.

    Returns
    -------
    x : pandas.DataFrame
        The actual-user rows followed by the fake-user rows. Each frame keeps
        its own index, so index values may repeat.
    y : list of int
        One label per row of ``x``, in row order: 1 for an actual user,
        0 for a fake user.
    """
    actual_users = pd.read_csv(actual_path)
    fake_users = pd.read_csv(fake_path)
    x = pd.concat([actual_users, fake_users])
    # The labels must follow the concatenation order above (actual first,
    # fake second); building them fake-first mislabels the rows whenever the
    # two files differ in length or content.
    y = len(actual_users) * [1] + len(fake_users) * [0]
    return x, y


In [None]:
def predict_sex(name):
    """Encode the gender guessed from each first name as an integer.

    Parameters
    ----------
    name : pandas.Series of str
        Full names. The text before the first space is used as the first name.

    Returns
    -------
    pandas.Series of int
        Same index as ``name``. Codes run from -2 (female) through
        -1 (mostly_female), 0 (unknown or androgynous) and 1 (mostly_male)
        to 2 (male).
    """
    detector = gender.Detector()
    first_name = name.str.split(' ').str.get(0)
    sex = first_name.apply(detector.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        # 'andy' is the detector's label for androgynous names; it sits at
        # the neutral midpoint of the scale rather than at the male end.
        'andy': 0,
        'mostly_male': 1,
        'male': 2,
    }
    # A label missing from the table maps to NaN; treat it as unknown (0) so
    # the integer cast cannot fail.
    sex_code = sex.map(sex_dict).fillna(0).astype(int)
    return sex_code

In [None]:
def extract_features(x):
    """Build the numeric feature table used by the classifier.

    Parameters
    ----------
    x : pandas.DataFrame
        Profile rows. Must contain ``lang`` and the five count columns listed
        below. ``sex_code`` is used as-is when present; otherwise it is
        derived with ``predict_sex`` from a ``name`` column (assumed to hold
        the full name -- TODO confirm against the CSV schema).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the feature columns. The input frame is not
        modified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    x = x.copy()

    # Each distinct language gets its position in the sorted list of
    # distinct languages.
    lang_dict = {lang: code for code, lang in enumerate(np.unique(x['lang']))}
    x['lang_code'] = x['lang'].map(lang_dict).astype(int)

    # The feature list below requires sex_code; compute it when the input
    # does not already provide it.
    if 'sex_code' not in x.columns:
        x['sex_code'] = predict_sex(x['name'])

    feature_columns_to_use = ['statuses_count','followers_count','friends_count','favourites_count','listed_count','sex_code','lang_code']
    return x.loc[:, feature_columns_to_use]

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : array-like
        Matrix passed straight to ``plt.imshow``.
    title : str, optional
        Text placed above the image.
    cmap : matplotlib colormap, optional
        Colour map used for the cells.

    Both axes get two ticks labelled 'Fake' and 'Actual'. Nothing is returned
    and ``plt.show()`` is not called here.
    """
    class_labels = ['Fake', 'Actual']
    tick_positions = np.arange(len(class_labels))

    # Image, heading and colour scale.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on each axis.
    plt.xticks(tick_positions, class_labels, rotation=45)
    plt.yticks(tick_positions, class_labels)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
def plot_roc_curve(y_test, y_pred):
    """Print the ROC coordinates and plot the curve with its AUC in the legend.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``roc_curve``.
    y_pred : array-like
        Predicted labels or scores, forwarded to ``roc_curve``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)

    print("False Positive rate: ", fpr)
    print("True Positive rate: ", tpr)

    area_under_curve = auc(fpr, tpr)
    legend_text = 'AUC = %0.2f' % area_under_curve

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=legend_text)
    plt.legend(loc='lower right')
    # Dashed diagonal from (0, 0) to (1, 1) for reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


In [None]:
def _plot_learning_curve(estimator, title, X, y, cv=5):
    """Plot mean training and cross-validation scores against training-set size."""
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv)
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    test_mean = test_scores.mean(axis=1)
    test_std = test_scores.std(axis=1)

    plt.figure()
    plt.title(title)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.grid()
    # Shaded bands show one standard deviation across the CV folds.
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                     alpha=0.1, color='g')
    plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')


def train(X_train, y_train, X_test, y_test=None, random_state=None):
    """Train a Random Forest classifier and predict the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test
        Features to predict.
    y_test : optional
        True labels for ``X_test``, returned unchanged next to the
        predictions. When omitted, the notebook-level ``y_test`` is used,
        which is what the three-argument form has always returned.
    random_state : int or None, optional
        Seed forwarded to the forest; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (y_test, y_pred)
        The true test labels and the predicted test labels.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed in nor defined at notebook level.
    """
    # Resolve y_test before the expensive fit so a missing value fails fast.
    if y_test is None:
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "y_test was not passed to train() and no notebook-level "
                "y_test is defined"
            ) from None

    clf = RandomForestClassifier(n_estimators=40, oob_score=True,
                                 random_state=random_state)
    clf.fit(X_train, y_train)
    print("The best classifier is: ", clf)

    # Estimate the score with 5-fold cross-validation on the training data.
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(scores)
    print('Estimatedscore: %0.5f(+/-%0.5f)' % (scores.mean(), scores.std() / 2))

    title = 'Learning Curves (Random Forest)'
    _plot_learning_curve(clf, title, X_train, y_train, cv=5)
    plt.show()

    # Predict the held-out rows.
    y_pred = clf.predict(X_test)
    return y_test, y_pred

In [None]:
print("reading datasets.....\n")

# x receives the profile table and y the per-row labels.
x, y = read_datasets()

# Summary statistics, shown as the cell output.
x.describe()


In [None]:
print ("extracting featues.....\n")
x = extract_features(x)
# Only the last expression of a cell is displayed, so the column list has to
# be printed explicitly to be seen.
print(list(x.columns))
x.describe()

In [None]:
print ("training datasets.......\n")
# Create the train/test partitions that train() and the evaluation cells
# below rely on. 20% of the rows are held out; the fixed seed makes the split
# repeatable and stratify keeps the fake/actual ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=44, stratify=y
)
y_test, y_pred = train(X_train, y_train, X_test)


In [None]:
# Share of test rows whose predicted label equals the true label.
print(
    'Classification Accuracy on Test dataset: ',
    accuracy_score(y_test, y_pred),
)

In [None]:
# percentError is not defined or imported in the visible notebook, so the
# misclassification percentage is computed directly: 100 * (wrong / total).
print ('Percent Error on Test dataset: ' ,100.0 * np.mean(np.asarray(y_pred) != np.asarray(y_test)))


In [None]:
# Raw counts of true versus predicted labels on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Confusion matrix, without normalization')
print(cm)

plot_confusion_matrix(cm=cm)


In [None]:
# Divide every row by its own total so each row sums to 1.
cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)

print('Normalized confusion matrix')
print(cm_normalized)

plot_confusion_matrix(cm=cm_normalized, title='Normalized confusion matrix')


In [None]:
# Text report of the per-class metrics, with readable class names.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Fake', 'Actual'],
    )
)



In [None]:
# roc_auc_score is never imported in this notebook. roc_curve and auc are, and
# for a binary problem the area under that curve is the same quantity.
s = auc(*roc_curve(y_test, y_pred)[:2])
print ("roc_auc_score : ",s)

In [None]:
# Draw the ROC curve for the test-set predictions.
plot_roc_curve(y_test=y_test, y_pred=y_pred)
