In [60]:
import pandas as pd
import re
import numpy as np
from nltk.stem.porter import *
import nltk
from sacremoses import MosesDetokenizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [42]:
def to_csv(address):
    twitter_df=pd.read_csv(address, sep="\t",names=['ID','Event','Type','Content'])
    return twitter_df

In [43]:
def remove_pattern(input_txt,pattern):
    r=re.findall(pattern, input_txt)
    for i in r:
        input_txt=re.sub(i, '', input_txt)
    return input_txt  

In [44]:
def preprocess(twitter_df,language):
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['Content'], "@[\w]*")
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['tidy_Content'], r"#(\w+)")
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['tidy_Content'], r'http://[a-zA-Z0-9.?/&=:]*')
    if language=='english':
        twitter_df['tidy_Content']=twitter_df['tidy_Content'].str.replace("[^a-zA-Z#]", " ",regex=True)
    twitter_df['tidy_Content']=twitter_df['tidy_Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

    token_tweet=twitter_df['tidy_Content'].apply(lambda x: x.split())
    stemmer=PorterStemmer()
    token_tweet=token_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

    detokenizer=MosesDetokenizer()
    for i in range(len(token_tweet)):
        token_tweet[i]=detokenizer.detokenize(token_tweet[i], return_str=True)
    twitter_df['tidy_Content']=token_tweet
    #########################################################################################################
    twitter_df['tidy_Content']=twitter_df['tidy_Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    #########################################################################################################
    return twitter_df

In [45]:
def plot_wordcloud(twitter_df,event):
    all_words=" ".join([text for text in twitter_df['tidy_Content'][twitter_df['Event']==event]])
    wordcloud=WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [46]:
# function to collect hashtags
def hashtag_extract(df):
    hashtags = []
    # Loop over the words in the tweet
    for i in df:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags

In [47]:
def plot_histogram(HT):
    a = nltk.FreqDist(HT)
    d = pd.DataFrame({'Hashtag': list(a.keys()),'Count': list(a.values())})
    # selecting top 10 most frequent hashtags     
    d = d.nlargest(columns="Count", n = 10) 
    plt.figure(figsize=(16,5))
    ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
    ax.set(ylabel = 'Count')
    plt.show()

In [48]:
def preprocess_y(twitter_df,language):
    if language=='english':
        tfidf_vectorizer=TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
    elif language=='arabic':
        tfidf_vectorizer=TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000)
    # TF-IDF feature matrix
    tfidf=tfidf_vectorizer.fit_transform(twitter_df['tidy_Content'])
    y_validation=pd.DataFrame(columns = ['Type'])
    y_validation['Type']=twitter_df['Type'].copy()

    #1:positive   0:negative   2:neutral
    for i in range(0,y_validation['Type'].size):
        if y_validation['Type'][i]=='positive':
            y_validation['Type'][i]=1
        elif y_validation['Type'][i]=='negative':
            y_validation['Type'][i]=0
    return y_validation,tfidf

In [49]:
def plot_one_event(event,twitter_df):
    plot_wordcloud(twitter_df,event)
    # extracting hashtags from positive tweets
    HT_event= hashtag_extract(twitter_df['Content'][twitter_df['Event']==event])
    # unnesting list
    HT_event=sum(HT_event,[])
    plot_histogram(HT_event)

In [50]:
def one_event(event,twitter_df):
    df=pd.DataFrame(columns=['ID','Type','Content','tidy_Content'])
    for i in range(0,twitter_df.index.size):
        if twitter_df['Event'][i]==event:
            df_new=pd.DataFrame(twitter_df.iloc[i])
            df_new_T=pd.DataFrame(df_new.values.T, index=df_new.columns, columns=df_new.index)
            df=pd.concat([df,df_new_T],axis=0)
            df=df.reset_index(drop=True)
    return df

In [51]:
def svm_design(x_train,x_test, y_train,y_test):#for SVM model
    classifier = svm.SVC(kernel='linear')#use linear instead of other kernels
    classifier.fit(x_train, y_train)#training model
    pred = classifier.predict(x_test)#make prediction
    print(accuracy_score(y_test, pred))#show results
    return accuracy_score(y_test, pred)

In [52]:
def knn_design(X_train,X_test, y_train,y_test,k):#for Knn model
    neigh = KNeighborsClassifier(n_neighbors=k)#classify data
    neigh.fit(X_train, y_train) # Fit KNN model
    return accuracy_score(y_test,neigh.predict(X_test))#return prediction

In [53]:
def svm_model(twitter_df,language):
    y_validation,tfidf=preprocess_y(twitter_df,language)
    x_train, x_test,y_train,y_test=train_test_split(tfidf, y_validation['Type'].astype('int'), random_state=42, test_size=0.2)
    x_train_valid, x_test_valid, y_train_valid, y_test_valid=train_test_split(tfidf[:(y_validation.size*4)//5],y_validation['Type'].astype('int')[:(y_validation.size*4)//5], random_state=None, test_size=0.25)

    acc_valid=svm_design(x_train_valid, x_test_valid, y_train_valid, y_test_valid)
    acc_test=svm_design(x_train, x_test,y_train,y_test)
    return acc_valid,acc_test

In [54]:
def knn_model(twitter_df,language):
    y_validation,tfidf=preprocess_y(twitter_df,language)
    x_train, x_test,y_train,y_test=train_test_split(tfidf, y_validation['Type'].astype('int'), random_state=42, test_size=0.2)
    x_train_valid, x_test_valid, y_train_valid, y_test_valid=train_test_split(tfidf[:(y_validation.size*4)//5],y_validation['Type'].astype('int')[:(y_validation.size*4)//5], random_state=None, test_size=0.25)

    acc_valid=knn_design(x_train_valid, x_test_valid, y_train_valid, y_test_valid,1)
    acc_test=knn_design(x_train, x_test,y_train,y_test,1)
    return acc_valid,acc_test

In [59]:
twitter_df=to_csv('../Datasets/twitter-2016train-BD.txt')
twitter_df=preprocess(twitter_df,'english')
all_event=twitter_df['Event'].value_counts()
all_event=all_event.index.values
v_avg_k=0
t_avg_k=0
v_avg_s=0
t_avg_s=0
for event in all_event:
    df=one_event(event,twitter_df)
    #print(event)
    #naive_bayes(y_validation,tfidf)
    #acc_v_s,acc_t_s=svm_model(df,'english')
    acc_v_k,acc_t_k=knn_model(df,'english)
    v_avg_k+=acc_v_k
    t_avg_k+=acc_t_k
    #v_avg_s+=acc_v_s
    #t_avg_s+=acc_t_s
v_avg_k=v_avg_k/60
t_avg_k=t_avg_k/60
#v_avg_s=v_avg_s/60
#t_avg_s=t_avg_s/60
print('knn valid average accuracy:')
print(v_avg_k)
print('knn test accuracy:')
print(t_avg_k)
#print('svm valid average accuracy:')
#print(v_avg_s)
#print('svm test accuracy:')
#print(t_avg_s)

disneyland
1.0
1.0
eric church
0.9473684210526315
0.9473684210526315
fleetwood mac
0.9473684210526315
1.0
bob marley
1.0
0.9473684210526315
magic mike xxl
1.0
0.9473684210526315
ac/dc


ValueError: The number of classes has to be greater than one; got 1 class

In [58]:
twitter_df_a=to_csv('../Datasets/twitter-2016train-BD-arabic.txt')
twitter_df_a=preprocess(twitter_df_a,'arabic')

all_event_a=twitter_df_a['Event'].value_counts()
all_event_a=all_event_a.index.values
v_avg_k_a=0
t_avg_k_a=0
v_avg_s_a=0
t_avg_s_a=0
for event in all_event_a:
    df=one_event(event,twitter_df_a)
    #print(event)
    #acc_v_s,acc_t_s=svm_model(df,'arabic')
    acc_v_k_a,acc_t_k_a=knn_model(df,'arabic')
    v_avg_k_a+=acc_v_k_a
    t_avg_k_a+=acc_t_k_a
    #v_avg_s+=acc_v_s
    #t_avg_s+=acc_t_s
v_avg_k_a=v_avg_k_a/34
t_avg_k_a=t_avg_k_a/34
#v_avg_s=v_avg_s/60
#t_avg_s=t_avg_s/60
print('knn valid average accuracy:')
print(v_avg_k_a)
print('knn test accuracy:')
print(t_avg_k_a)
#print('Arabic svm valid average accuracy:')
#print(v_avg_s)
#print('Arabic svm test accuracy:')
#print(t_avg_s)

knn valid average accuracy:
0.7601485860352644
knn test accuracy:
0.7750636786320179
