In [3]:
import pandas as pd
from scipy.io import loadmat
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from utilities import remove_empty_tweets


In [2]:
train_data_path = 'cleaned_data/cleaned_train_data_for_subtask1.csv'
test_data_path = 'cleaned_data/cleaned_test_data_for_subtask1.csv'
#read files.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

print("Train set:"% train_data.columns, train_data.shape, len(train_data)) 
print("Test set:"% test_data.columns, test_data.shape, len(test_data)) 



Train set: (20974, 8) 20974
Test set: (4997, 8) 4997


In [4]:
train_data = remove_empty_tweets(train_data, "#2_tweet_clean_V1")
test = remove_empty_tweets(test_data, "#2_tweet_clean_V1")

train_data.head()

Unnamed: 0,#1_tweetid,#2_tweet,#3_country_label,#2_tweet_clean_V0,#2_tweet_clean_V1,#2_tweet_clean_V2,#2_tweet_clean_V3,#classes_id
0,TRAIN_0,حاجة حلوة اكيد,Egypt,حاجة حلوة اكيد,حاجه حلوه اكيد,حاجه حلوه اكيد,حاجه حلوه اكيد,0
1,TRAIN_1,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,Iraq,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,عم بشتغلوا لشعب الاميركي اما نحن يكذبوا ويغشوا...,عم بشتغلوا لشعب الاميركي يكذبوا ويغشوا ويسرقوا...,عم بشتغلوا لشعب الاميركي يكذبوا ويغشوا ويسرقوا...,1
2,TRAIN_2,ابشر طال عمرك,Saudi_Arabia,ابشر طال عمرك,ابشر طال عمرك,ابشر طال عمرك,ابشر طال عمرك,2
3,TRAIN_3,منطق 2017: أنا والغريب علي إبن عمي وأنا والغري...,Mauritania,منطق أنا والغريب علي إبن عمي وأنا والغريب وإب...,منطق انا والغريب علي ابن عمي وانا والغريب وابن...,منطق والغريب ابن عمي وانا والغريب وابن عمي اخو...,منطق والغريب ابن عمي وانا وابن اخويا قطع العلا...,3
4,TRAIN_4,شهرين وتروح والباقي غير صيف ملينا,Algeria,شهرين وتروح والباقي غير صيف ملينا,شهرين وتروح والباقي غير صيف ملينا,شهرين وتروح والباقي صيف ملينا,شهرين وتروح والباقي صيف ملينا,4


In [5]:
#prepare train and test data.
X_train = train_data['#2_tweet_clean_V1'].tolist()
y_train = train_data['#classes_id'].tolist()
X_test = test_data['#2_tweet_clean_V1'].tolist()
y_test = test_data['#classes_id'].tolist()


In [13]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(train_data['#2_tweet_clean_V1'],train_data['#classes_id'],test_size=0.3, random_state=42)

print(len(x_train),len(x_valid))

14681 6293


In [15]:
#Create vocabulary

Tfidf_vect = TfidfVectorizer(max_features=5000, ngram_range=(1,3), stop_words=STOPWORDS)
Tfidf_vect.fit(train_data['#2_tweet_clean_V1'])
Train_X_Tfidf = Tfidf_vect.transform(x_train)
Test_X_Tfidf = Tfidf_vect.transform(x_valid)

print(Tfidf_vect.vocabulary_)

{'حاجه': 2001, 'حلوه': 2127, 'اكيد': 350, 'حاجه حلوه': 2002, 'عم': 2977, 'اما': 1128, 'نحن': 4181, 'طال': 2786, 'عمرك': 2982, 'طال عمرك': 2787, 'انا': 1165, 'علي': 2961, 'ابن': 19, 'عمي': 2997, 'وانا': 4451, 'وابن': 4378, 'اخويا': 134, 'قطع': 3287, 'العلاقات': 795, 'مع': 3915, 'قطر': 3286, 'موريتانيا': 4116, 'شهرين': 2665, 'والباقي': 4417, 'غير': 3074, 'صيف': 2768, 'ملينا': 4008, 'واله': 4431, 'ما': 3660, 'حد': 2049, 'ولا': 4603, 'منك': 4081, 'انتى': 1222, 'اساسا': 181, 'واله ما': 4442, 'ما حد': 3672, 'نفس': 4221, 'الوقت': 1086, 'على': 2941, 'ان': 1156, 'اي': 1312, 'هدف': 4303, 'دايما': 2248, 'عشان': 2902, 'نفسنا': 4225, 'الفاضي': 834, 'نفس الوقت': 4222, 'خرا': 2179, 'بقا': 1561, 'علشان': 2937, 'عليكي': 2970, 'نفسك': 4224, 'دخل': 2249, 'هو': 4347, 'حلو': 2124, 'بس': 1490, 'ليك': 3649, 'عدنا': 2880, 'حلو بس': 2126, 'بصراحه': 1520, 'مسلسل': 3872, 'هيك': 4372, 'بتذكر': 1423, 'كان': 3342, 'اسمها': 209, 'في': 3167, 'احمد': 99, 'ربي': 2383, 'كي': 3475, 'يجي': 4802, 'واحد': 4385, 'منهم': 4086

In [26]:
test_X_Tfidf = Tfidf_vect.transform(X_test)

In [27]:
print(test_X_Tfidf)


  (0, 4686)	0.2725975949915372
  (0, 3508)	0.6415376866873623
  (0, 3490)	0.6003354275927336
  (0, 2961)	0.3920680070366711
  (2, 4834)	0.4806093873555371
  (2, 4686)	0.22683678440777136
  (2, 4624)	0.538938446041737
  (2, 2380)	0.36145456538965853
  (2, 919)	0.5444773994964055
  (3, 4385)	0.3511588430380518
  (3, 3610)	0.3744236716461203
  (3, 3216)	0.5554519543228398
  (3, 3013)	0.45440883178969727
  (3, 2684)	0.33050102053802594
  (3, 1312)	0.3350361122640352
  (4, 4364)	0.59789450637883
  (4, 2616)	0.6830940277353629
  (4, 2039)	0.4194099528078635
  (5, 4127)	0.4891438736639621
  (5, 2702)	0.4982152879755282
  (5, 2235)	0.3306280683541731
  (5, 1347)	0.5220759805696495
  (5, 1343)	0.2961330090818576
  (5, 1165)	0.20723607201746264
  (6, 4136)	0.7065462430350279
  :	:
  (4991, 1490)	0.36921557159185053
  (4991, 1165)	0.33016595533171134
  (4992, 3863)	0.6112193293814701
  (4992, 3167)	0.29606537363679203
  (4992, 692)	0.7340001538990116
  (4993, 4601)	0.43970363937125917
  (4993, 44

#  Radial Basis Function  (RBF) kernel

In [29]:
import timeit

start = timeit.default_timer()

SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=2)
SVM.fit(Train_X_Tfidf,y_train)

stop = timeit.default_timer()
print('Train time: ', stop - start)  

# predict the labels on validation dataset
predictions_SVM_valid = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_valid, y_valid)*100)



Train time:  32.19084000000112
SVM Accuracy Score ->  31.860797711743206


In [30]:

predictions_SVM = SVM.predict(test_X_Tfidf)

print(classification_report(predictions_SVM, y_test))

              precision    recall  f1-score   support

           0       0.91      0.31      0.47      3022
           1       0.52      0.31      0.39      1108
           2       0.24      0.26      0.25       488
           3       0.00      0.00      0.00         0
           4       0.19      0.46      0.27       177
           5       0.04      0.26      0.07        42
           6       0.04      0.19      0.07        72
           7       0.02      0.60      0.03         5
           8       0.01      0.33      0.01         3
           9       0.02      0.40      0.04        10
          10       0.00      0.00      0.00         2
          11       0.01      0.17      0.01         6
          12       0.02      0.50      0.04         4
          13       0.06      0.38      0.10        47
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.01      0.50      0.02         2
          17       0.00    

# Quadratic (poly) kernel

In [18]:
startquad = timeit.default_timer()

SVMquad = svm.SVC(C=1.0, kernel='poly', degree=3, gamma=2)
SVMquad.fit(Train_X_Tfidf,y_train)

stopquad = timeit.default_timer()
print('Train time: ', stopquad - startquad)  

# predict the labels on validation dataset
predictions_SVMquad = SVMquad.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVMquad, y_valid)*100)



Train time:  90.71464130000004
SVM Accuracy Score ->  26.473859844271413


In [32]:

predictions_SVMquad_test = SVMquad.predict(test_X_Tfidf)

print(classification_report(predictions_SVMquad_test, y_test))

              precision    recall  f1-score   support

           0       0.86      0.27      0.41      3377
           1       0.40      0.33      0.36       796
           2       0.13      0.23      0.16       289
           3       0.00      0.00      0.00         2
           4       0.13      0.36      0.19       154
           5       0.04      0.18      0.07        65
           6       0.04      0.16      0.06        90
           7       0.02      0.19      0.03        16
           8       0.01      0.25      0.02         8
           9       0.02      0.16      0.04        31
          10       0.00      0.00      0.00         2
          11       0.01      0.06      0.01        17
          12       0.02      0.22      0.04         9
          13       0.06      0.22      0.10        89
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.02      0.29      0.04         7
          17       0.01    

In [23]:
print(classification_report(predictions_SVMquad, y_valid))

              precision    recall  f1-score   support

           0       0.86      0.25      0.38      4368
           1       0.39      0.35      0.37       942
           2       0.09      0.21      0.13       261
           3       0.00      0.00      0.00         0
           4       0.19      0.43      0.26       247
           5       0.03      0.13      0.04        76
           6       0.06      0.22      0.09       115
           7       0.05      0.37      0.09        35
           8       0.01      0.18      0.03        17
           9       0.03      0.24      0.05        34
          10       0.03      0.33      0.06         6
          11       0.01      0.08      0.02        26
          12       0.04      0.38      0.08        13
          13       0.07      0.24      0.11       109
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         9
          17       0.02    

  _warn_prf(average, modifier, msg_start, len(result))


# Linear kernel

In [19]:
startlin = timeit.default_timer()

SVMlin = svm.SVC(C=1.0, kernel='linear', degree=3, gamma=2)
SVMlin.fit(Train_X_Tfidf,y_train)

stoplin = timeit.default_timer()
print('Train time: ', stoplin - startlin)  

# predict the labels on validation dataset
predictions_SVMlin = SVMlin.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVMlin, y_valid)*100)



Train time:  55.940753599999994
SVM Accuracy Score ->  37.86747179405689


In [33]:

predictions_SVMlin_test = SVMlin.predict(test_X_Tfidf)

print(classification_report(predictions_SVMlin_test, y_test))

              precision    recall  f1-score   support

           0       0.84      0.43      0.57      2034
           1       0.51      0.32      0.39      1035
           2       0.41      0.25      0.31       844
           3       0.11      0.86      0.20         7
           4       0.35      0.40      0.37       384
           5       0.12      0.25      0.16       132
           6       0.17      0.22      0.19       269
           7       0.06      0.29      0.10        35
           8       0.03      0.20      0.05        25
           9       0.05      0.43      0.09        23
          10       0.00      0.00      0.00         5
          11       0.02      0.12      0.03        24
          12       0.03      0.60      0.05         5
          13       0.19      0.40      0.26       151
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         0
          16       0.04      1.00      0.07         4
          17       0.04    

In [24]:
print(classification_report(predictions_SVMlin, y_valid))

              precision    recall  f1-score   support

           0       0.85      0.42      0.56      2555
           1       0.54      0.35      0.42      1261
           2       0.41      0.26      0.32       990
           3       0.13      1.00      0.23        10
           4       0.43      0.46      0.44       525
           5       0.14      0.35      0.20       164
           6       0.20      0.29      0.24       311
           7       0.20      0.60      0.30        87
           8       0.09      0.59      0.16        32
           9       0.08      0.50      0.14        42
          10       0.29      0.82      0.43        22
          11       0.03      0.29      0.05        21
          12       0.01      0.33      0.02         3
          13       0.25      0.39      0.31       237
          14       0.03      0.22      0.05         9
          15       0.00      0.00      0.00         0
          16       0.01      0.50      0.01         2
          17       0.02    

# Sigmoid kernel

In [20]:
startsig = timeit.default_timer()

SVMsig = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma=2)
SVMsig.fit(Train_X_Tfidf,y_train)

stopsig = timeit.default_timer()
print('Train time: ', stopsig - startsig)  

# predict the labels on validation dataset
predictions_SVMsig = SVMsig.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVMsig, y_valid)*100)



Train time:  40.608465600000045
SVM Accuracy Score ->  36.437311298267915


In [25]:
print(classification_report(predictions_SVMsig, y_valid))

              precision    recall  f1-score   support

           0       0.83      0.42      0.56      2433
           1       0.50      0.32      0.39      1268
           2       0.39      0.25      0.30       976
           3       0.13      1.00      0.23        10
           4       0.41      0.43      0.42       526
           5       0.13      0.29      0.18       181
           6       0.19      0.25      0.22       342
           7       0.20      0.56      0.30        96
           8       0.12      0.43      0.19        58
           9       0.09      0.36      0.14        64
          10       0.32      0.80      0.46        25
          11       0.04      0.24      0.07        34
          12       0.01      0.20      0.02         5
          13       0.23      0.38      0.29       232
          14       0.03      0.22      0.05         9
          15       0.00      0.00      0.00         0
          16       0.01      0.17      0.01         6
          17       0.02    