In [52]:
#Importing the libraries
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import nltk

In [53]:
#Importing the data
data = pd.read_csv("tweets.csv")
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [54]:
#Preprocessing
import string
import re
def remove_punctuation_of_text(text):
  punctuation_free = ''.join([i for i in text if i not in string.punctuation])
  #replacing links with empty string using regex
  no_links = re.sub(r'http\S+','',punctuation_free)
  return no_links

nltk.download('punkt')
def tokenization(text):
  tokens = nltk.word_tokenize(text)
  return tokens

nltk.download('stopwords')
stop_words= nltk.corpus.stopwords.words('english')
def remove_stop_words(text):
  no_stop_words = [i for i in text if i not in stop_words]
  return no_stop_words

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
def lemmatized_text(text):
  lemma = [wordnet_lem.lemmatize(word) for word in text]
  return lemma


def preprocess(column):
  corpus = []
  for item in column:
    new_item = remove_punctuation_of_text(item)
    new_item = new_item.lower()
    new_item = tokenization(new_item)
    new_item = remove_stop_words(new_item)
    new_item = lemmatized_text(new_item)
    corpus.append(new_item)
  return corpus

#Dropping the id column
# data.drop('id', axis=1, inplace = True)

#Adding the preprocessed text as a new column to the data df
data['processed'] = preprocess(data['tweet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [55]:
pd.set_option('display.max_colwidth',None)
data.head()

Unnamed: 0,id,label,tweet,processed
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,"[finally, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias…]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,"[love, would, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,"[im, wired, know, im, george, made, way, iphone, cute, daventry, home]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,"[amazing, service, apple, wont, even, talk, question, unless, pay, 1995, stupid, support]"


### CBOW

In [56]:
# Modelling

from sklearn.model_selection import train_test_split
import gensim
#splitting to x and y
x = data['processed']
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

# creating Word2Vec model
word2vec_model = gensim.models.Word2Vec(x_train, min_count=1)

#assigning the words to word variable from the word2vec
words = word2vec_model.wv.index_to_key

#assigning the vectors for each word in the train and test set
train_vec = np.array([np.array(word2vec_model.wv[i] for i in item if i in words) for item in x_train], dtype=object)
test_vec = np.array([np.array(word2vec_model.wv[i] for i in item if i in words) for item in x_test], dtype=object)

#assigning a zero vector for words that are not in the word2vec vector and taking the mean for words that are in word2vec
#applying the same process for training and testing data



In [57]:
train_vec.ndim

1

In [58]:
x_train_vec_avg = [np.mean(v, axis=0) if v.ndim > 0 and v.size > 0 else np.zeros(100, dtype=float) for v in train_vec]
x_test_vec_avg = [np.mean(v, axis=0) if v.ndim > 0 and v.size > 0 else np.zeros(100, dtype=float) for v in test_vec]


In [59]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train_vec_avg, y_train)

In [60]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(x_train_vec_avg)
accuracy_score(y_pred,y_train)

0.7484217171717171

In [61]:
y_pred = clf.predict(x_test_vec_avg)
accuracy_score(y_pred,y_test)

0.7272727272727273

In [62]:
sms = "What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!"
sms = preprocess(sms)


valid_words = [word2vec_model.wv[i] for i in sms if i in words]
x_test1 = np.array(valid_words).mean(axis=0) if valid_words else np.zeros(100, dtype=float)
x_test1 = x_test1.reshape(1, -1) if not np.isnan(x_test1).any() else np.zeros((1, 100), dtype=float)

y_pred = clf.predict(x_test1)
print(y_pred)

[0]


### Bag of Words

In [67]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
corpus = [' '.join(doc) for doc in preprocess(data['tweet'])]
train_data = cv.fit_transform(corpus)
x= train_data
y = data['label']


In [70]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
x_train, x_test, y_train,y_test = train_test_split(x,y,test_size=0.2)
clf.fit(x_train,y_train)




In [71]:

from sklearn.metrics import accuracy_score

y_pred = clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.8566919191919192

### LSTM

In [98]:
#tokenization

from keras.preprocessing import text
tokenizer = text.Tokenizer()

tokenizer.fit_on_texts(list(data['tweet']))

tokenized_text = tokenizer.texts_to_sequences(data['tweet'])


In [99]:
len(data['tweet'][0])



128

In [100]:
len(tokenized_text[0])

16

In [101]:
# to avoid the problem with differrent sizes

from keras.utils import pad_sequences

x = pad_sequences(tokenized_text, maxlen=100)

In [None]:
# to get all the words

tokenizer.word_index

In [103]:
len(tokenizer.word_index)

24171

In [105]:
# splitting the data to x and y

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, data['label'].values, test_size=0.2)

In [106]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SimpleRNN, Dropout

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim = 128, input_length=100))
model.add(LSTM(10))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [107]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [108]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          3094016   
                                                                 
 lstm (LSTM)                 (None, 10)                5560      
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 50)                550       
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 3100177 (11.83 MB)
Trainable params: 31001

In [109]:
model.fit(x_train, y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ce990565990>

In [110]:
y_preds = model.predict(x_test)



In [111]:
y_preds[y_preds > 0.5] = 1
y_preds[y_preds<0.5] = 0

In [113]:
accuracy_score( y_preds,y_test)

0.8472222222222222

In [114]:
test1 = "What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!"
test1 = tokenizer.texts_to_sequences([test1])
test2 = pad_sequences(test1, maxlen=100)
output = model.predict(test2)
output[output>0.5] = 1
output[output < 0.5] = 0
output



array([[1.]], dtype=float32)

LSTM has better accuracy