In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
data = pd.read_csv("../artifacts/sentiment_analysis.csv")

In [8]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Data Preprocessing

In [9]:
data.shape

(7920, 3)

In [10]:
data.duplicated().sum()

0

In [11]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

# ##Text Preprocessing

In [12]:
import re
import string

In [13]:
data["tweet"].head(5)

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

convert uppercase to lowercase

In [14]:
data['tweet'] = data['tweet'].apply(lambda x: ' '.join(x.lower() for x in x.split())) 

In [15]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [16]:
data["tweet"].head(7)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
5    iphone software update fucked up my phone big ...
6    happy for us .. #instapic #instadaily #us #son...
Name: tweet, dtype: object

# remove links

In [17]:
data['tweet'] = data['tweet'].apply(lambda x: ' '.join(re.sub(r'^https?:\/\/.*[\r\n]*','',x, flags=re.MULTILINE) for x in x.split()))

In [18]:
data["tweet"].head(7)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
5    iphone software update fucked up my phone big ...
6    happy for us .. #instapic #instadaily #us #son...
Name: tweet, dtype: object

Remove punctuations

In [19]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation,'')
        return text

data['tweet'] = data['tweet'].apply(remove_punctuations)

In [21]:
data["tweet"].head(7)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this would you go? #talk #makememories...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service apple won't even talk to ...
5    iphone software update fucked up my phone big ...
6    happy for us .. #instapic #instadaily #us #son...
Name: tweet, dtype: object

In [22]:
data["tweet"].head(7)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this would you go? #talk #makememories...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service apple won't even talk to ...
5    iphone software update fucked up my phone big ...
6    happy for us .. #instapic #instadaily #us #son...
Name: tweet, dtype: object

In [23]:
import string

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data['tweet'] = data['tweet'].apply(remove_punctuations)


In [24]:
data["tweet"].head(7)

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
5    iphone software update fucked up my phone big ...
6    happy for us  instapic instadaily us sony xper...
Name: tweet, dtype: object

In [25]:
data["tweet"].tail(7)

7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely 90 year old neighbor with he...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

# remove numbers

In [26]:
data['tweet'] = data['tweet'].str.replace('\d+','',regex=True)

In [27]:
data["tweet"].tail(7)

7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

remove stopwords

In [28]:
!pip install nltk



In [29]:
import nltk

In [30]:
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
with open('../static/moodel/corpora/stopwords/english','r') as file:
    sw = file.read().splitlines()

FileNotFoundError: [Errno 2] No such file or directory: '../static/moodel/corpora/stopwords/english'

In [None]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()


In [None]:
sw

In [None]:
data['tweet'] = data['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in sw)) 

Stemming

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer

In [None]:
data['tweet'] = data['tweet'].apply(lambda x: ' '.join(ps.stem(x) for x  in x.split())) 

In [None]:
import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

data['tweet'] = data['tweet'].apply(lambda x: ' '.join(ps.stem(word) for word in x.split()))


In [None]:
data["tweet"].head(5)

In [32]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beaut...
1,2,0,finally a transparant silicon case thanks to ...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...
...,...,...,...
7915,7916,0,live out loud lol liveoutloud selfie smile son...
7916,7917,0,we would like to wish you an amazing day make ...
7917,7918,0,helping my lovely year old neighbor with her ...
7918,7919,0,finally got my smart pocket wifi stay connecte...


# Building vocabulary

In [33]:
from collections import Counter
vocab = Counter()


In [34]:
vocab

Counter()

In [35]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [36]:
len(vocab)


18272

In [37]:
data.shape

(7920, 3)

In [38]:
 tokens = [key for key in vocab if vocab[key] > 10]

In [39]:
tokens

['android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone',
 'finally',
 'a',
 'case',
 'thanks',
 'to',
 'my',
 'yay',
 'sony',
 'xperia',
 's',
 'we',
 'love',
 'this',
 'would',
 'you',
 'go',
 'talk',
 'relax',
 'smartphone',
 'wifi',
 'connect',
 'im',
 'i',
 'know',
 'was',
 'made',
 'that',
 'way',
 'home',
 'what',
 'amazing',
 'service',
 'apple',
 'wont',
 'even',
 'me',
 'about',
 'have',
 'pay',
 'them',
 'for',
 'their',
 'stupid',
 'support',
 'software',
 'update',
 'fucked',
 'up',
 'phone',
 'big',
 'time',
 'iphones',
 'happy',
 'us',
 'instapic',
 'instadaily',
 'xperiaz',
 'new',
 'type',
 'c',
 'charger',
 'cable',
 'uk',
 '…',
 'amazon',
 'year',
 'newyear',
 'starting',
 'technology',
 'samsunggalaxys',
 'iphonex',
 'shopping',
 'again',
 'listening',
 'music',
 'likeforlike',
 'photo',
 'fun',
 'selfie',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'hey',
 'when',
 'make',
 'ipod',
 'dont',
 'it

In [40]:
len(tokens)

1270

In [44]:
def save_vocabulary(lines, filename):
    try:
        data = '\n'.join(lines)
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(data)
    except Exception as e:
        print(f"Error while saving the file: {e}")

save_vocabulary(tokens, '../static/model/vocabulary.txt')
