Sir,
    As you asked are not providing the GloVe Model since it is over 2Gb. Rest of the things are there.

In [1]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score



In [2]:
# taking the .txt file that contains model
glove_input_file = './glove.twitter.27B/glove.twitter.27B.200d.txt'
word2vec_output_file = 'twitter.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(1193514, 200)

In [3]:
# load the Stanford GloVe model trained on twitter dataset
filename = 'twitter.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

### Load Twitter Data

In [9]:
df = pd.read_csv('./Dataset/SemEval2018-T3-train-taskA.txt', sep = '\t')

In [10]:
df.head()

Unnamed: 0,id,label,tweet
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [11]:
# checking if data is balanced
df['label'].value_counts()

0    1916
1    1901
Name: label, dtype: int64

In [12]:
df = df.drop(['id'], axis =1)

## Data Cleaning 

1.Remove URLs <br>
2.Remove usernames (mentions)<br>
3.Remove hashtags<br>
4.Remove special characters<br>


#### 1. Removing urls

In [13]:
# we will first take just the data

data = df['tweet']
df['tweet'][5]

"You're never too old for Footie Pajamas. http://t.co/ElzGqsX2yQ"

In [14]:
# cheching by removing url in one row
result = re.sub(r"http\S+", "", data[0])

In [15]:
#removing urls from entire dataset
dt = []                                            #new list
for result in data:                                #running loop for full dataset
    result = re.sub(r"http\S+", "", result)         
    dt.append(result)

In [16]:
data = np.asanyarray(dt)
df['tweet'] = data

In [17]:
df.head()

Unnamed: 0,label,tweet
0,1,Sweet United Nations video. Just in time for C...
1,1,@mrdahl87 We are rumored to have talked to Erv...
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,1,I can't breathe! was chosen as the most notabl...


#### 2. Remove usernames

In [18]:
# we will first take just the data

data = df['tweet']
df['tweet'][1]

"@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)"

In [19]:
#removing usernames from entire dataset
dt = []                                            #new list
for result in data:                                #running loop for full dataset
    result = re.sub(r"@[^\s]+[\s]?", "", result)         
    dt.append(result)

In [20]:
data = np.asanyarray(dt)
df['tweet'] = data
df.head()

Unnamed: 0,label,tweet
0,1,Sweet United Nations video. Just in time for C...
1,1,We are rumored to have talked to Erv's agent.....
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,1,I can't breathe! was chosen as the most notabl...


#### 3. Remove Hashtags

In [21]:
# we will first take just the data

data = df['tweet']
df['tweet'][0]

'Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  '

In [22]:
#removing usernames from entire dataset
dt = []                                            #new list
for result in data:                                #running loop for full dataset
    result = re.sub(r"#[^\s]+[\s]?", "", result)         
    dt.append(result)

In [23]:
data = np.asanyarray(dt)
df['tweet'] = data
df.head()

Unnamed: 0,label,tweet
0,1,Sweet United Nations video. Just in time for C...
1,1,We are rumored to have talked to Erv's agent.....
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,1,I can't breathe! was chosen as the most notabl...


#### 4. Remove special characters

In [24]:
# we will first take just the data

data = df['tweet']
df['tweet'][1]

"We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)"

In [25]:
for remove in map(lambda r: re.compile(re.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                     "@", "%", "^", "*", "(", ")", "{", "}",
                                                                     "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                     "!", "?", ".", "'",
                                                                     "--", "---", "#"]):
            df.loc[:, "tweet"].replace(remove, "", inplace=True)

In [26]:
df['tweet'][1]

'We are rumored to have talked to Ervs agent and the Angels asked about Ed Escobar thats hardly nothing    '

In [27]:
df.to_csv('./cleaned_data/converted_after_special_chars.csv', sep = '\t')

### Now doing word2vec embeddings for the data and appending the column that represents the overall word2vec of the sentence

#### Converting to lowercase

In [28]:
df['tweet'] = df['tweet'].str.lower()

#### Tokenizing

In [29]:
tokens = []
for i in range(len(df['tweet'])):
    temp = nltk.word_tokenize(df['tweet'][i])
    temp = np.asarray(temp)
    tokens.append(temp)

In [30]:
#tokens = np.asarray(tokens)
df['text'] = tokens
df.head()

Unnamed: 0,label,tweet,text
0,1,sweet united nations video just in time for ch...,"[sweet, united, nations, video, just, in, time..."
1,1,we are rumored to have talked to ervs agent an...,"[we, are, rumored, to, have, talked, to, ervs,..."
2,1,hey there nice to see you minnesotand winter w...,"[hey, there, nice, to, see, you, minnesotand, ..."
3,0,3 episodes left im dying over here,"[3, episodes, left, im, dying, over, here]"
4,1,i cant breathe was chosen as the most notable ...,"[i, cant, breathe, was, chosen, as, the, most,..."


#### building word2vec representation for every sentence by taking the mean of all the words

In [31]:
mean_wv = []
#i=0
length = len(model.word_vec('hello'))
for sentence in df['text']:
    #i = i + 1
    l = []
    for w in sentence:
        if w in model.wv:
            temp = model.word_vec(w)
        else:
            temp = np.zeros(length)
        l.append(temp)   
        
    l = np.asarray(l)
    #mean_wv.append(l)
    if len(l) == 0:
        m = np.zeros(length)
    else:
        m = np.mean(l, axis = 0)
    
    #print(len(l))
    mean_wv.append(m)

mean_wv = np.asarray(mean_wv)
    
#i

  


In [32]:
mean_wv.shape

(3817, 200)

In [33]:
df['mean'] = mean_wv.tolist()

In [34]:
df.head()

Unnamed: 0,label,tweet,text,mean
0,1,sweet united nations video just in time for ch...,"[sweet, united, nations, video, just, in, time...","[0.009712891653180122, 0.0635509341955185, 0.1..."
1,1,we are rumored to have talked to ervs agent an...,"[we, are, rumored, to, have, talked, to, ervs,...","[0.009882523531192228, 0.1374173654537452, 0.0..."
2,1,hey there nice to see you minnesotand winter w...,"[hey, there, nice, to, see, you, minnesotand, ...","[0.08628474579503138, 0.17179111225737465, -0...."
3,0,3 episodes left im dying over here,"[3, episodes, left, im, dying, over, here]","[0.18709842727652617, -0.023326572562967027, -..."
4,1,i cant breathe was chosen as the most notable ...,"[i, cant, breathe, was, chosen, as, the, most,...","[0.003277106210589409, 0.19845840334892273, 0...."


### Taking the features as the mean vectors we got and then training on them

In [35]:
data = np.array(df)

In [36]:
Y = data[:,:1]
X = data[:,3:]
Y=Y.astype('int')

# getting proper X
y = []
for i in range(len(X)):
    y.append(X[i][0])

y = np.asarray(y)

X = y

In [42]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2) 

In [43]:
for kernel in ('poly', 'rbf'):
    clf = SVC(kernel=kernel)
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy using ' + kernel + ' kernel : ' + str(accuracy_score(y_pred, y_test)))

  y = column_or_1d(y, warn=True)


Accuracy using poly kernel : 0.49345549738219896
Accuracy using rbf kernel : 0.6151832460732984


In [44]:
acc = accuracy_score(y_test, y_pred)
print('Accuracy : ', acc)

cnf = confusion_matrix(y_test, y_pred)

print('\nConfusion Matrix : ')
print(cnf)

print('\nPrecission Score : ', precision_score(y_test, y_pred))

print('\nRecall Score : ', recall_score(y_test, y_pred))


f1 = f1_score(y_test, y_pred)
print('\nF1 Score : ')
print(f1)

Accuracy :  0.6151832460732984

Confusion Matrix : 
[[265 112]
 [182 205]]

Precission Score :  0.6466876971608833

Recall Score :  0.5297157622739018

F1 Score : 
0.5823863636363636


In [45]:
df.head()

Unnamed: 0,label,tweet,text,mean
0,1,sweet united nations video just in time for ch...,"[sweet, united, nations, video, just, in, time...","[0.009712891653180122, 0.0635509341955185, 0.1..."
1,1,we are rumored to have talked to ervs agent an...,"[we, are, rumored, to, have, talked, to, ervs,...","[0.009882523531192228, 0.1374173654537452, 0.0..."
2,1,hey there nice to see you minnesotand winter w...,"[hey, there, nice, to, see, you, minnesotand, ...","[0.08628474579503138, 0.17179111225737465, -0...."
3,0,3 episodes left im dying over here,"[3, episodes, left, im, dying, over, here]","[0.18709842727652617, -0.023326572562967027, -..."
4,1,i cant breathe was chosen as the most notable ...,"[i, cant, breathe, was, chosen, as, the, most,...","[0.003277106210589409, 0.19845840334892273, 0...."


#### just checking for sentences with less than or equal to 1 (is commented below)

In [46]:
# for i in df.index.values:
#     if(len(df.text[i]) <= 1):
#         print(i)

#### trying more sofisticated approach

1. first apply window approach and get max and min distance
2. use best and worst similarity measures

In [47]:
# applying max and min
final_vect = []
dist_max = []
dist_min = []

for sent in df.text:
    if(len(sent) == 0):
        final_vect.append(np.zeros(200))
        dist_max.append(0)
        dist_min.append(0)
    elif(len(sent) == 1):
        if(sent[0] in model.wv):
            temp = model.get_vector(sent[0])
            final_vect.append(temp)
            dist_min.append(np.linalg.norm(temp))
            dist_max.append(np.linalg.norm(temp))
        else:
            final_vect.append(np.zeros(200))
            dist_max.append(0)
            dist_min.append(0)
    else:
        w = np.zeros(200)
        w1 = np.zeros(200)
        i = 1
        
        while(i< len(sent)):
            c1 = 0
            c2 = 0
            v1 = np.zeros(200)
            v2 = np.zeros(200)
            
            for j in range(i):
                if(sent[j] in model.wv):
                    v1 += model.wv[sent[j]]
                    c1 += 1
                else:
                    v1 += np.zeros(200)
                    c1 += 1
            
            k = i
            while k < len(sent):
                if(sent[k] in model.wv):
                    v2 += model.wv[sent[k]]
                    k += 1
                    c2 += 1
                else:
                    v2 += np.zeros(200)
                    k += 1
                    c2 += 1
            v1 = v1 / c1
            v2 = v2 / c2

            if i == 1:
                w = v1 - v2
                w1 = w
            else:
                if(np.linalg.norm(w) < np.linalg.norm(v1 - v2)):
                    w = v1 - v2 
                else:
                    w1 = v1 - v2
                    
            #print(i)
            i += 1
        final_vect.append(w)
        dist_max.append(np.linalg.norm(w))
        dist_min.append(np.linalg.norm(w1))
    

  if sys.path[0] == '':


### Now final_vect has the max distant vectors and dist_max and dist_min have the maximum and minimum distances

In [48]:
#converting the lists to arrays
final_features = np.asarray(final_vect)
max_dist = np.asarray(dist_max).reshape(-1,1)
min_dist = np.asarray(dist_min).reshape(-1,1)

In [49]:
#  stacking above calculated features together

X = np.hstack((final_features, max_dist, min_dist))

In [50]:
Y = np.array(df.label)

In [51]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33)

In [53]:
clf = SVC()
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [54]:
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy : ', acc)

cnf = confusion_matrix(y_test, y_pred)

print('\nConfusion Matrix : ')
print(cnf)

print('\nPrecission Score : ', precision_score(y_test, y_pred))

print('\nRecall Score : ', recall_score(y_test, y_pred))


f1 = f1_score(y_test, y_pred)
print('\nF1 Score : ')
print(f1)

Accuracy :  0.5388888888888889

Confusion Matrix : 
[[390 233]
 [348 289]]

Precission Score :  0.553639846743295

Recall Score :  0.45368916797488223

F1 Score : 
0.4987057808455565


#### We will now try to use the similarity measure

In [55]:
# applying max and min
best_sim = []
sec_best_sim = []

worst_sim = []
sec_worst_sim = []

for sent in df.text:
    if(len(sent) == 0):
        
        best_sim.append(0)
        sec_best_sim.append(0)
        
        worst_sim.append(0)
        sec_worst_sim.append(0)
        
    elif(len(sent) == 1):
        if(sent[0] in model.wv):                         #if the word in vocab            
            best_sim.append(1)
            sec_best_sim.append(1)
            
            worst_sim.append(1)
            sec_worst_sim.append(1)
        else:                                            #if word not in vocab
            best_sim.append(1)
            sec_best_sim.append(1)
            
            worst_sim.append(1)
            sec_worst_sim.append(1)
    else:
        i = 0
        max1 = 0
        max2 = 0
        min1 = 1
        min2 = 1
        while(i< len(sent) - 1):
            j = i + 1
            while( j < len(sent) ):
                if( sent[i] in model.wv and sent[j] in model.wv ):
                    temp = model.similarity(sent[i], sent[j])
                    if(temp > max1):
                        max2 = max1
                        max1 = temp
                    if(temp < min1):
                        min2 = min1
                        min1 = temp
                
                j += 1
            
            i += 1
            
           
        best_sim.append(max1)
        sec_best_sim.append(max2)
        
        worst_sim.append(min1)
        sec_worst_sim.append(min2)



Converting lists to arrays

In [56]:
best_sim = np.array(best_sim).reshape(-1,1)
sec_best_sim = np.array(sec_best_sim).reshape(-1,1)

worst_sim = np.array(worst_sim).reshape(-1,1)
sec_worst_sim = np.array(sec_worst_sim).reshape(-1,1)

In [57]:
#  stacking above calculated features together

X = np.hstack((best_sim, sec_best_sim, worst_sim, sec_worst_sim))

In [58]:
Y = np.array(df.label)

In [70]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [71]:
clf = SVC(C = 10)
clf.fit(x_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [73]:
y_pred = clf.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print('Accuracy : ', acc)

cnf = confusion_matrix(y_test, y_pred)


print('\nPrecission Score : ', precision_score(y_test, y_pred))

print('\nRecall Score : ', recall_score(y_test, y_pred))


f1 = f1_score(y_test, y_pred)
print('\nF1 Score : ')
print(f1)

Accuracy :  0.5274869109947644

Precission Score :  0.5200642054574639

Recall Score :  0.8393782383419689

F1 Score : 
0.6422200198216056


#### We saw that 1st and 3rd approaches did well


So combining features from the two

In [211]:
X_means = data[:,3:]

# getting proper X
y = []
for i in range(len(X)):
    y.append(X_means[i][0])

y = np.asarray(y)

X_means = y

In [234]:
train_data = np.hstack((X_means, X))
train_labels = Y
train_labels.shape

(3817,)

In [342]:
x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size = 0.2)

In [343]:
clf = SVC(C = 20)
clf.fit(x_train, y_train)

SVC(C=20, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [344]:
y_pred = clf.predict(x_test)

In [345]:
acc = accuracy_score(y_test, y_pred)
print('Accuracy : ', acc)

cnf = confusion_matrix(y_test, y_pred)

print('\nConfusion Matrix : ')
print(cnf)

print('\nPrecission Score : ', precision_score(y_test, y_pred))

print('\nRecall Score : ', recall_score(y_test, y_pred))


f1 = f1_score(y_test, y_pred)
print('\nF1 Score : ')
print(f1)

Accuracy :  0.6740837696335078

Confusion Matrix : 
[[268 121]
 [128 247]]

Precission Score :  0.6711956521739131

Recall Score :  0.6586666666666666

F1 Score : 
0.6648721399730821
