In [1]:
# Python 3+

# 3rd party imports (not present in the standard python library)
# To install, pip install numpy pandas

import numpy as np
import pandas as pd

# Standard python library imports

import glob

In [2]:
# A large dataset with 1.6 million tweets are being used to train the model
# Due to its size, the file is not included in this repository
# The dataset can be downloaded from https://www.kaggle.com/kazanova/sentiment140

# File in current workspace
glob.glob('*.csv')

['training.1600000.processed.noemoticon.csv']

In [3]:
# Import data

df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1', names = ["Score", "Id", "Date", "Flag", "User", "Tweet"])

In [4]:
# First 5 records

df.head()

Unnamed: 0,Score,Id,Date,Flag,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# To train the model, our primary data points are the tweet and the score assoociated with the score
# Score here is the sentiment where 0 = negative, 4 = positive
# Columns that are not required are removed and the score is normalized to be in the 0 - 1 range

df.drop(["Id", "Date", "Flag", "User"], axis = 1, inplace = True)
df['Score'] = df['Score'].apply(lambda i : i / 4)

In [7]:
df.head()

Unnamed: 0,Score,Tweet
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."


In [1]:
df['Tweet'][0]

NameError: name 'df' is not defined

In [10]:
## Tweet cleanup (this process takes a significant amount of time)
# Removing stop words, @ mentions, webpages and special characters

from nltk.corpus import stopwords # nltk.download('stopwords') before importing
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def clean(tweet):
    stage1 = [word for word in tweet.lower().split() if word not in stopwords.words('english')] # stopword removal
    stage2 = [word[1:] if word.startswith('#') else word for word in stage1] # Hashtag symbol removal
    stage3 = [stemmer.stem(word) for word in stage2 if not any([word.startswith('@'), word.startswith('http'), word.startswith('www')])] # @ mentions and websites removal and stemming
    return ' '.join(stage3)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

In [25]:
%%time

df['TweetStripped'] = df['Tweet'].apply(clean)

Wall time: 43min 41s


In [26]:
df.head()

Unnamed: 0,Score,Tweet,TweetStripped
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that' bummer. shoulda got david carr t..."
1,0.0,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
2,0.0,@Kenichan I dived many times for the ball. Man...,dive mani time ball. manag save 50% rest go bound
3,0.0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0.0,"@nationwideclass no, it's not behaving at all....","no, behav all. i'm mad. here? can't see there."
