In [3]:
# Import necessary libraries

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
#Load the Data

data = pd.read_csv('/content/tweets.csv')

In [7]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [8]:
pd.set_option('display.max_colwidth', None)
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [13]:
# Text preprocessing

def clean_text(text):
  #1.Punctuation removal
  text = text.translate(str.maketrans('', '', string.punctuation))
  #2.URLs removal
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  #3.LowerCase
  text = text.lower()
  #4.White Space removal
  text = text.strip()
  return text

In [17]:
# Apply the clean_text function to the text column

data['clean_msg'] = data['tweet'].apply(clean_text)
data.head()


Unnamed: 0,id,label,tweet,clean_msg
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired i know im george i was made that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple wont even talk to me about a question i have unless i pay them 1995 for their stupid support


In [18]:
#5.Tokenization (word_tokenization)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize(text):
  words = nltk.word_tokenize(text)
  return words

data['msg_tokens'] = data['clean_msg'].apply(lambda x: tokenize(x))
data.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,id,label,tweet,clean_msg,msg_tokens
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…,"[finally, a, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, s, sonyexperias…]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired i know im george i was made that way iphone cute daventry home,"[im, wired, i, know, im, george, i, was, made, that, way, iphone, cute, daventry, home]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple wont even talk to me about a question i have unless i pay them 1995 for their stupid support,"[what, amazing, service, apple, wont, even, talk, to, me, about, a, question, i, have, unless, i, pay, them, 1995, for, their, stupid, support]"


In [19]:
#5.Remove Stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.fileids())

stopwords = nltk.corpus.stopwords.words('english')
stopwords

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
def stop_remove (text):
  stop_removed = [i for i in text if i not in stopwords]
  return stop_removed

data['no_stopwords'] = data['msg_tokens'].apply(lambda x: stop_remove(x))
data.head()

Unnamed: 0,id,label,tweet,clean_msg,msg_tokens,no_stopwords
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]","[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…,"[finally, a, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, s, sonyexperias…]","[finally, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias…]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]","[love, would, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired i know im george i was made that way iphone cute daventry home,"[im, wired, i, know, im, george, i, was, made, that, way, iphone, cute, daventry, home]","[im, wired, know, im, george, made, way, iphone, cute, daventry, home]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple wont even talk to me about a question i have unless i pay them 1995 for their stupid support,"[what, amazing, service, apple, wont, even, talk, to, me, about, a, question, i, have, unless, i, pay, them, 1995, for, their, stupid, support]","[amazing, service, apple, wont, even, talk, question, unless, pay, 1995, stupid, support]"


In [21]:
#6.Stemming

from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemming(text):
  stem_text = [ps.stem(word) for word in text]
  return stem_text

data['msg_stem'] = data['no_stopwords'].apply(lambda x: stemming(x))
data.head()

Unnamed: 0,id,label,tweet,clean_msg,msg_tokens,no_stopwords,msg_stem
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]","[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]","[fingerprint, pregnanc, test, android, app, beauti, cute, health, iger, iphoneonli, iphonesia, iphon]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…,"[finally, a, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, s, sonyexperias…]","[finally, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias…]","[final, transpar, silicon, case, thank, uncl, yay, soni, xperia, sonyexperias…]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]","[love, would, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]","[love, would, go, talk, makememori, unplug, relax, iphon, smartphon, wifi, connect]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired i know im george i was made that way iphone cute daventry home,"[im, wired, i, know, im, george, i, was, made, that, way, iphone, cute, daventry, home]","[im, wired, know, im, george, made, way, iphone, cute, daventry, home]","[im, wire, know, im, georg, made, way, iphon, cute, daventri, home]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple wont even talk to me about a question i have unless i pay them 1995 for their stupid support,"[what, amazing, service, apple, wont, even, talk, to, me, about, a, question, i, have, unless, i, pay, them, 1995, for, their, stupid, support]","[amazing, service, apple, wont, even, talk, question, unless, pay, 1995, stupid, support]","[amaz, servic, appl, wont, even, talk, question, unless, pay, 1995, stupid, support]"


In [22]:
#7.Lematization

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemm = WordNetLemmatizer()

def lemma(text):
  lemm_text = [lemm.lemmatize(x) for x in text]
  return lemm_text

data['lemma'] = data['no_stopwords'].apply(lambda x: lemma(x))
data.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,id,label,tweet,clean_msg,msg_tokens,no_stopwords,msg_stem,lemma
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]","[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]","[fingerprint, pregnanc, test, android, app, beauti, cute, health, iger, iphoneonli, iphonesia, iphon]","[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…,"[finally, a, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, s, sonyexperias…]","[finally, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias…]","[final, transpar, silicon, case, thank, uncl, yay, soni, xperia, sonyexperias…]","[finally, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias…]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]","[love, would, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]","[love, would, go, talk, makememori, unplug, relax, iphon, smartphon, wifi, connect]","[love, would, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired i know im george i was made that way iphone cute daventry home,"[im, wired, i, know, im, george, i, was, made, that, way, iphone, cute, daventry, home]","[im, wired, know, im, george, made, way, iphone, cute, daventry, home]","[im, wire, know, im, georg, made, way, iphon, cute, daventri, home]","[im, wired, know, im, george, made, way, iphone, cute, daventry, home]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple wont even talk to me about a question i have unless i pay them 1995 for their stupid support,"[what, amazing, service, apple, wont, even, talk, to, me, about, a, question, i, have, unless, i, pay, them, 1995, for, their, stupid, support]","[amazing, service, apple, wont, even, talk, question, unless, pay, 1995, stupid, support]","[amaz, servic, appl, wont, even, talk, question, unless, pay, 1995, stupid, support]","[amazing, service, apple, wont, even, talk, question, unless, pay, 1995, stupid, support]"


In [25]:
#8.Vectorization

#TF-IDF vectorizer expects a string, join the tokens into a string


data['lemmatized_tweet_str'] = data['lemma'].apply(lambda x: ' '.join(x))

In [27]:
# Split the data


X = data['lemmatized_tweet_str']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
#Vectorize the text data using TF-IDF

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(f'Training data shape: {X_train_tfidf.shape}')
print(f'Testing data shape: {X_test_tfidf.shape}')

Training data shape: (6336, 14965)
Testing data shape: (1584, 14965)


In [31]:
#Import LogisticRegression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [32]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8686868686868687