In [3]:
import pandas as pd
import re
import numpy as np
from nltk.stem.porter import *
import nltk
from sacremoses import MosesDetokenizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB#naive bayes model

from sklearn.ensemble import RandomForestClassifier#random forest model 

from keras.preprocessing.text import Tokenizer#LSTM model
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from keras.models import Sequential
from keras import layers

In [22]:
def to_csv(address):
    data=open(address,'r',encoding='utf-8').readlines()
    ID=[]
    Type=[]
    Content=[]
    for row in data:
        ID.append(row[0:18])
        if not (row[19:26]=="neutral"):
            Type.append(row[19:27])
        else:
            Type.append(row[19:26])
        Content.append(row[28:])
    df_list={"ID":ID,"Type":Type,"Content":Content}
    twitter=pd.DataFrame(df_list,columns=['ID','Type','Content','tidy_Content'])
    return twitter

In [5]:
def remove_pattern(input_txt,pattern):
    r=re.findall(pattern, input_txt)
    for i in r:
        input_txt=re.sub(i, '', input_txt)
    return input_txt  

In [42]:
def preprocess(twitter_df,language):
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['Content'], "@[\w]*")
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['tidy_Content'], r"#(\w+)")
    twitter_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['tidy_Content'], r'http://[a-zA-Z0-9.?/&=:]*')
    if language=='english':
        twitter_df['tidy_Content']=twitter_df['tidy_Content'].str.replace("[^a-zA-Z#]", " ",regex=True)
    twitter_df['tidy_Content']=twitter_df['tidy_Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

    token_tweet=twitter_df['tidy_Content'].apply(lambda x: x.split())
    stemmer=PorterStemmer()
    token_tweet=token_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

    detokenizer=MosesDetokenizer()
    for i in range(len(token_tweet)):
        token_tweet[i]=detokenizer.detokenize(token_tweet[i], return_str=True)
    twitter_df['tidy_Content']=token_tweet
    #########################################################################################################
    twitter_df['tidy_Content']=twitter_df['tidy_Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    #########################################################################################################
    return twitter_df

In [7]:
def plot_wordcloud(twitter_df,type):
    all_words=" ".join([text for text in twitter_df['tidy_Content'][twitter_df['Type']==type]])
    wordcloud=WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [8]:
# function to collect hashtags
def hashtag_extract(df):
    hashtags = []
    # Loop over the words in the tweet
    for i in df:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags

In [9]:
def plot_histogram(HT):
    a = nltk.FreqDist(HT)
    d = pd.DataFrame({'Hashtag': list(a.keys()),'Count': list(a.values())})
    # selecting top 10 most frequent hashtags     
    d = d.nlargest(columns="Count", n = 10) 
    plt.figure(figsize=(16,5))
    ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
    ax.set(ylabel = 'Count')
    plt.show()

In [38]:
def preprocess_y(twitter_df,language):
    if language=='english':
        tfidf_vectorizer=TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
    elif language=='arabic':
        tfidf_vectorizer=TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000)
    # TF-IDF feature matrix
    tfidf=tfidf_vectorizer.fit_transform(twitter_df['tidy_Content'])
    y_validation=pd.DataFrame(columns = ['Type'])
    y_validation['Type']=twitter_df['Type'].copy()

    #1:positive   0:negative   2:neutral
    for i in range(0,y_validation['Type'].size):
        if y_validation['Type'][i]=='positive':
            y_validation['Type'][i]=1
        elif y_validation['Type'][i]=='negative':
            y_validation['Type'][i]=0
        elif y_validation['Type'][i]=='neutral':
            y_validation['Type'][i]=2
    return y_validation,tfidf

In [32]:
def naive_bayes(y_validation,tfidf):
    x_train_valid, x_test_valid, y_train_valid, y_test_valid=train_test_split(tfidf[:4800,:],y_validation['Type'][:4800].values.astype('int'), random_state=0, test_size=0.25)
    x_train, x_test,y_train,y_test=train_test_split(tfidf, y_validation['Type'].astype('int'), random_state=0, test_size=0.2)

    nb_valid=MultinomialNB()
    nb_valid.fit(x_train_valid,y_train_valid)
    print('naive bayes validation accuracy:')
    print(accuracy_score(y_test_valid,nb_valid.predict(x_test_valid)))

    nb_test=MultinomialNB()
    nb_test.fit(x_train,y_train)
    print('naive bayes test accuracy:')
    print(accuracy_score(y_test,nb_test.predict(x_test)))

In [33]:
def random_forest(y_validation,tfidf):
    x_train_valid, x_test_valid, y_train_valid, y_test_valid=train_test_split(tfidf[:4800,:], y_validation['Type'][:4800].values.astype('int'), random_state=0, test_size=0.25)
    x_train, x_test,y_train,y_test=train_test_split(tfidf, y_validation['Type'].astype('int'), random_state=0, test_size=0.2)

    rf_valid=RandomForestClassifier(n_estimators=500)
    rf_valid.fit(x_train_valid,y_train_valid)
    print('random forest valid accuracy:')
    print(accuracy_score(y_test_valid,rf_valid.predict(x_test_valid)))

    rf_test=RandomForestClassifier(n_estimators=500)
    rf_test.fit(x_train,y_train)
    print('random forest test accuracy:')
    print(accuracy_score(y_test,rf_test.predict(x_test)))

In [34]:
def data_cleaning(text_list): 
    stopwords_rem=False
    stopwords_en=stopwords.words('english')
    lemmatizer=WordNetLemmatizer()
    tokenizer=TweetTokenizer()
    reconstructed_list=[]
    for each_text in text_list: 
        lemmatized_tokens=[]
        tokens=tokenizer.tokenize(each_text.lower())
        pos_tags=pos_tag(tokens)
        for each_token, tag in pos_tags: 
            if tag.startswith('NN'): 
                pos='n'
            elif tag.startswith('VB'): 
                pos='v'
            elif tag.startswith('JJ'): 
                pos='a'
            elif tag.startswith('R'):
                pos='r'
            lemmatized_token=lemmatizer.lemmatize(each_token, pos)
            if stopwords_rem: # False 
                if lemmatized_token not in stopwords_en: 
                    lemmatized_tokens.append(lemmatized_token)
            else: 
                lemmatized_tokens.append(lemmatized_token)
        reconstructed_list.append(' '.join(lemmatized_tokens))
    return reconstructed_list

In [40]:
def LSTM_model(twitter_df,y_validation,language):
    # 将数据分解为训练集和测试集
    if language=='english':
        lstm_df=pd.DataFrame(columns=['tidy_Content'])
        lstm_df['tidy_Content']=np.vectorize(remove_pattern)(twitter_df['Content'], "@[\w]*")
        lstm_df['tidy_Content']=np.vectorize(remove_pattern)(lstm_df['tidy_Content'], r"#(\w+)")
        lstm_df['tidy_Content']=np.vectorize(remove_pattern)(lstm_df['tidy_Content'], r'http://[a-zA-Z0-9.?/&=:]*')
        lstm_df['tidy_Content']=lstm_df['tidy_Content'].str.replace("[^a-zA-Z#]", " ",regex=True)
        lstm_df['tidy_Content']=lstm_df['tidy_Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
        X_train, X_test,y_train,y_test=train_test_split(lstm_df['tidy_Content'], y_validation['Type'], random_state=0, test_size=0.2)
    elif language=='arabic':
        X_train, X_test,y_train,y_test=train_test_split(twitter_df['Content'], y_validation['Type'], random_state=0, test_size=0.2)

    # 拟合并转换数据
    X_train=data_cleaning(X_train)
    X_test=data_cleaning(X_test)
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab_size=len(tokenizer.word_index)+1
    print(f'Vocab Size: {vocab_size}')
    X_train=pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=40)
    X_test=pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=40)
    y_train=to_categorical(y_train)
    y_test=to_categorical(y_test)

    # 创建带有嵌入层的LSTM模型并拟合训练数据
    model=Sequential()
    model.add(layers.Embedding(input_dim=vocab_size,output_dim=100,input_length=40))
    model.add(layers.Bidirectional(layers.LSTM(128)))
    model.add(layers.Dense(3,activation='softmax'))
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    model.fit(X_train,y_train,batch_size=256,epochs=5,validation_data=(X_test,y_test))

In [44]:
twitter_df=to_csv('../Datasets/twitter-2016train-A.txt')
twitter_df=preprocess(twitter_df,'english')

#plot_wordcloud(twitter_df,'positive')
#plot_wordcloud(twitter_df,'negative')
#plot_wordcloud(twitter_df,'neutral')

# extracting hashtags from positive tweets
HT_positive = hashtag_extract(twitter_df['Content'][twitter_df['Type']=='positive'])
# extracting hashtags from negative tweets
HT_negative = hashtag_extract(twitter_df['Content'][twitter_df['Type']=='negative'])
# extracting hashtags from neutral tweets
HT_neutral = hashtag_extract(twitter_df['Content'][twitter_df['Type']=='neutral'])
# unnesting list
HT_positive=sum(HT_positive,[])
HT_negative=sum(HT_negative,[])
HT_neutral=sum(HT_neutral,[])
#plot_histogram(HT_positive)
#plot_histogram(HT_negative)
#plot_histogram(HT_neutral)

print('FOR ENGLISH:')
y_validation,tfidf=preprocess_y(twitter_df,'english')
naive_bayes(y_validation,tfidf)
random_forest(y_validation,tfidf)
LSTM_model(twitter_df,y_validation,'english')

naive bayes validation accuracy:
0.6041666666666666
naive bayes test accuracy:
0.5991666666666666
random forest valid accuracy:
0.6208333333333333
random forest test accuracy:
0.6091666666666666
Vocab Size: 7980
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
twitter_df_a=to_csv('../Datasets/twitter-2016train-A-arabic.txt')
twitter_df_a=preprocess(twitter_df_a,'arabic')

#plot_wordcloud(twitter_df_a,'positive')
#plot_wordcloud(twitter_df_a,'negative')
#plot_wordcloud(twitter_df_a,'neutral')

# extracting hashtags from positive tweets
HT_positive_a = hashtag_extract(twitter_df_a['Content'][twitter_df_a['Type']=='positive'])
# extracting hashtags from negative tweets
HT_negative_a = hashtag_extract(twitter_df_a['Content'][twitter_df_a['Type']=='negative'])
# extracting hashtags from neutral tweets
HT_neutral_a = hashtag_extract(twitter_df_a['Content'][twitter_df_a['Type']=='neutral'])
# unnesting list
HT_positive_a=sum(HT_positive_a,[])
HT_negative_a=sum(HT_negative_a,[])
HT_neutral_a=sum(HT_neutral_a,[])
#plot_histogram(HT_positive_a)
#plot_histogram(HT_negative_a)
#plot_histogram(HT_neutral_a)

print('FOR ARABIC:')
y_validation_a,tfidf_a=preprocess_y(twitter_df_a,'arabic')
naive_bayes(y_validation_a,tfidf_a)
random_forest(y_validation_a,tfidf_a)
LSTM_model(twitter_df_a,y_validation_a,'arabic')

naive bayes validation accuracy:
0.5983313468414779
naive bayes test accuracy:
0.5991058122205664
random forest valid accuracy:
0.5530393325387366
random forest test accuracy:
0.5827123695976155
Vocab Size: 18133
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
