In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif,SelectPercentile
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report 

data_no_additional_feature = '../data/training_user_tweet.csv'
data_with_tweet_csv = '../data/training_user_tweet.csv'
#tweets_user='tweets.csv'

In [27]:
from textblob import TextBlob
import pandas as pd
from nltk.corpus import stopwords
from math import factorial
import itertools as it
from numpy import linalg as LA

#%%
def modify_df(user_tweets):
    """
    remove unnecessary words from the user_tweets.csv
    
    Argument: user_tweets
    
    Return: user_tweets dataframe
    """

    special_remove = [str(tweets) for tweets in user_tweets['text']] 
    user_tweets['tweet_split'] = [tweets.lower().split() for tweets in special_remove]
    user_tweets['tweet_split'] = [filter(lambda x: not (x.startswith("@") or x.startswith("#") or x.startswith("https:") or x in stopwords.words("english") or x.startswith("rt") or x[0].isdigit()), tweet) for tweet in user_tweets['tweet_split']]
    user_tweets['tweet_split_string'] = [' '.join(str(x) for x in tweets) for tweets in user_tweets['tweet_split']]
    return user_tweets

#%%
def comb_2(user_tweets):
    """
    calculate the number of tweet combinations
    
    Argument: user_tweets
    
    Return: total number of tweet combinations
    """
    num_tweets = len(user_tweets['text'])
    return int(factorial(num_tweets) / (factorial(2) * factorial(num_tweets - 2)))

#%%
def sim_formula(user_tweets):
    """
    calculate set of pair in tweets
    
    Argument: user_tweets
    
    Return: set of pair in tweets ratio
    """
    if len(user_tweets) != 0: 
        user_tweets = modify_df(user_tweets)
        ind = [TextBlob(tweets).word_counts for tweets in user_tweets['tweet_split_string']]
        vector_df = pd.DataFrame(ind)
        vector_df = vector_df.fillna(0)
        vector_matrix = vector_df.as_matrix()
        idx = list(it.combinations(range(vector_df.shape[0]), 2))
        sim_dot = {}
        for i, j in idx:
            sim_dot[(i, j)] = vector_matrix[i, :].dot(vector_matrix[j,:])

        sim_norm = {}
        for x, y in idx:
            sim_norm[(x, y)] = LA.norm(vector_matrix[x, :]) * LA.norm(vector_matrix[y,:])
        
        dot_set = set(sim_dot)
        norm_set = set(sim_norm)

        sim_result = {}
        for key in dot_set.intersection(norm_set):
            if sim_norm[key] != 0:
                sim_result[key] = (sim_dot[key] / sim_norm[key])/comb_2(user_tweets)
        result_sum = sum(sim_result.values())
    else:
        result_sum = "None"

    return result_sum

In [16]:
# import pandas as pd
# #import similarity_2 as sim_2
# from pandas.io.common import CParserError
# import numpy as np

# #%%
# sample_data_id=pd.read_csv('tweets.csv')
# #sample_data_id = pd.read_csv(open("tweets.csv", 'rU'), encoding = 'utf-8', usecols = ['id'])
# sample_data_id['user_id'] = sample_data_id['user_id'].astype(float)
# sample_id = sample_data_id['user_id'].unique()

#%%


  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
from __future__ import division
"""
verify tweet‘s authenticity using url_ratio, url_unique_ratio, hashtag_ratio, username_ratio and username_unique_ratio
"""

#from compiler.ast import flatten
import re
import collections

def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
            
#%%
def url_ratio(user_tweets):
    """
    calculate the percentage of 20 recent Tweets containing URLs
    
    Argument: tweets_df
    
    Return: tweets_url_ratio
    """
    if len(user_tweets) != 0: 
        top_20 = user_tweets[:20]
        tweets_url_ratio = sum(top_20['text'].str.contains("https:") == True)/len(user_tweets['text'])
    else:
        tweets_url_ratio = 'None'
    return tweets_url_ratio


#%%
def url_unique_ratio(user_tweets):
    """
    calculate the ratio of the number of unique URLs in the 20 recent tweets
    
    Argument: tweets_df
    
    Return: url_ratio
    """
    if len(user_tweets) != 0: 
        top_20 = user_tweets[:20]
        # find all the urls using regular expression
        urls = [re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(tweets)) for tweets in top_20['text']]
        # flatten a list of lists
        urls_flatten = flatten(urls)
        #urls_flatten = urls.flatten()
        # get the first two parts of the url
        urls_split = [u.split('/')[0:3] for u in urls_flatten]
        
        urls_unique = [list(u) for u in set(tuple(u) for u in urls_split)]
        url_unique= len(urls_unique)
        tweet_total = len(user_tweets['text'])
        url_ratio = url_unique/tweet_total
    else:
        url_ratio = 'None'
    return url_ratio

#%%
def hashtag_ratio(user_tweets):
    """
    calculate the hashtag ratio
    
    Argument: tweets_df
    
    Return: hashtag ratio
    """
    if len(user_tweets) != 0: 
        top_20 = user_tweets[:20]
        hashtag_ratio = sum(top_20['text'].str.contains("#"))/len(top_20['text'])
    else:
        hashtag_ratio = 'None'
    return hashtag_ratio

#%%
def username_ratio(user_tweets):
    """
    calculate the username ratio
    
    Argument: tweets_df
    
    Return: username ratio
    """
    if len(user_tweets) != 0: 
        top_20 = user_tweets[:20]
        username_ratio = sum(top_20['text'].str.contains("@"))/len(top_20['text'])
    else:
        username_ratio = 'None'
    return username_ratio
    
#%%
def username_unique_ratio(user_tweets):
    """
    calculate the ratio of the number of unique @usernames
    
    Argument: tweets_df
    
    Return: username_unique_ratio
    """
    if len(user_tweets) != 0: 
        top_20 = user_tweets[:20]
    
        username = [re.findall('@([A-Za-z0-9_]+)', str(tweets)) for tweets in top_20['text']]
        # flatten a list of lists
        username_flatten = flatten(username)
        #username_flatten = username.flatten()
        username_unique = set(username_flatten)
        user_unique= len(username_unique)
        # total number of users that were being @, not all the tweets
        tweet_total = len(user_tweets['text'])
        user_ratio = user_unique/tweet_total
    else:
        user_ratio = 'None'
    return user_ratio

In [56]:


files=['../data/datasets_full.csv/genuine_accounts.csv/tweets.csv',
       '../data/datasets_full.csv/social_spambots_1.csv/tweets.csv',
       '../data/datasets_full.csv/social_spambots_2.csv/tweets.csv',
       '../data/datasets_full.csv/social_spambots_3.csv/tweets.csv',
       '../data/datasets_full.csv/traditional_spambots_1.csv/tweets.csv',
       '../data/datasets_full.csv/fake_followers.csv/tweets.csv']

i=0
sample_tweet_analysis_ratio = pd.DataFrame(columns = ["id", "similarity_ratio_2","url_ratio", "url_unique_ratio", "hashtag_ratio", "username_ratio", "username_unique_ratio"])

for file in files:
    
    #path=  '⁨Users/harinath⁩/Downloads⁩/'+file
    sample_data_id=pd.read_csv(file)
    #sample_data_id = pd.read_csv(open("tweets.csv", 'rU'), encoding = 'utf-8', usecols = ['id'])
    sample_data_id['user_id'] = sample_data_id['user_id'].astype(float)
    sample_id = sample_data_id['user_id'].unique()

    

    
    for user in sample_id:
        t_df=sample_data_id.loc[sample_data_id['user_id'] == user]

        similarity_ratio_2=sim_formula(t_df)

        url_ratio_out = url_ratio(t_df)

        #%%
        url_unique_ratio_out = url_unique_ratio(t_df)

        #%%
        hashtag_ratio_out = hashtag_ratio(t_df)

        #%%
        username_ratio_out = username_ratio(t_df)

        #%%
        username_unique_ratio_out = username_unique_ratio(t_df)
        #similarity_ratio_2 = [sim_formula(sample_tw[int(i)]) for i in range(len(sample_tw))]
        sample_tweet_analysis_ratio.loc[i] = [user,similarity_ratio_2,url_ratio_out, url_unique_ratio_out, hashtag_ratio_out, username_ratio_out, username_unique_ratio_out]
        i=i+1

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [64]:
#merge these features with original table
data_org=pd.read_csv(data_no_additional_feature)
new_df = pd.merge(data_org, sample_tweet_analysis_ratio,  how='inner', left_on=['id',], right_on = ['id'])

In [71]:
trainY = new_df.loc[:,'label']
trainX=new_df[new_df.columns.difference(['label','id', 'Unnamed: 0', 'name', 'screen_name',
       'url', 'lang', 'time_zone', 'location', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_image_url',
       'profile_banner_url', 'profile_use_background_image',
       'profile_background_image_url_https', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_border_color',
       'profile_background_tile', 'profile_sidebar_fill_color',
       'profile_background_image_url', 'profile_background_color',
       'profile_link_color', 'utc_offset', 'protected', 'verified',
       'description', 'created_at', 'updated', 'file'])]
#trainY=trainY.fillna(trainY.median())



trainX=np.array(trainX)
trainY =np.array(trainY)

In [2]:
#statuses count
#ff_ratio
#tweet rate
rem_features=['label','id', 'Unnamed: 0', 'name', 'screen_name',
       'url', 'lang', 'time_zone', 'location', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_image_url',
       'profile_banner_url', 'profile_use_background_image',
       'profile_background_image_url_https', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_border_color',
       'profile_background_tile', 'profile_sidebar_fill_color',
       'profile_background_image_url', 'profile_background_color',
       'profile_link_color', 'utc_offset', 'protected', 'verified',
       'description', 'created_at', 'updated', 'file']

cols=['Features']
feature_importances=pd.DataFrame(columns=cols)
feature_importances['Features'] = new_df[new_df.columns.difference(rem_features)].columns
train_auc=[]
test_auc=[]
cv = StratifiedKFold(n_splits=10)
i = 0
for train, test in cv.split(trainX, trainY):
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    classifier = xgb.XGBClassifier(objective='binary:logistic' , booster='gbtree', eta=0.001,
                               max_depth=20,colsample_bytree=0.7, subsample=0.8,min_child_weight=5,gamma=0,
                                n_estimators=100, 
                                n_jobs=-1, verbose=False)
    #classifier=xgb.XGBClassifier(**params)
    classifier.fit(trainX[train], trainY[train],
                        eval_set=[(trainX[train], trainY[train]), 
                                  (trainX[test], trainY[test])],
                                eval_metric='logloss', verbose=False)
    
    evals_result= classifier.evals_result()
    text='Value_'+str(i)
    i=i+1
    feature_importances[text] = classifier.feature_importances_

    train_auc.append(evals_result['validation_0']['logloss'][99])
    test_auc.append(evals_result['validation_1']['logloss'][99])
    #print(evals_result)
    

predict = classifier.predict(trainX[test])
print(classification_report(trainY[test], predict))
    
mean_auc_train = np.mean(train_auc)
std_auc_train = 1.96*np.std(train_auc)
# 
mean_auc_test = np.mean(test_auc)
std_auc_test = 1.96*np.std(test_auc)
# 
print('Test set auc:' + str(mean_auc_train) + '(CI :'+ str(std_auc_train) + ')')
print('Train set auc:' + str(mean_auc_test) + '(CI :'+ str(std_auc_test) + ')')



NameError: name 'pd' is not defined