In [10]:
#Load in all the necessary libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk
import re

#stop words is used to remove words such as the, a, this etc.
from nltk.corpus import stopwords
nltk.download('stopwords')

nltk.download('wordnet')

#Stemming reduces words into there most basic form for example 'running ran run' the stem is run
from nltk.stem import PorterStemmer, WordNetLemmatizer 

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\S00185812\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\S00185812\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Read the data into a dataframe
df = pd.read_csv("Data/Twitter.csv")

#Examine the first couple of rows of the dataframe
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
#Renaming the columns to something more fitting and clean
df = df.rename(columns={'clean_text': 'text', 'category': 'sentiment'})

df.head()

Unnamed: 0,text,sentiment
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
#Drop empty rows
df = df.dropna()

#Get how many rows and cols are there in the dataframe
df.shape
#As we can see there are over 150,000 tweets in the database

(162969, 2)

In [5]:
#Renaming the values of -1, 0, and 1 to Negative, Neutral and Positive to make more sense when im manipulating the list
df['sentiment'] = df['sentiment'].map({-1: 'Negative', 0: 'Neutral', 1: 'Positive'})  

df.head()

Unnamed: 0,text,sentiment
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [12]:
#store the english stop words
stopWords = stopwords.words('english')

#Here I am just taking the 2 parameters we need from the dataframe
#the tweet will be used for the input of our NLP model
tweets = df['text']
#the sentiment will be used for the output of our model
sentiments = df['sentiment']

#In theory the model should take an inputted tweet and can output what kind of sentiment it will have

#function to take in a tweet clean it for processing and output it again
def cleanTweet(tweet):
    
    #convert tweet to lowercase
    tweet = tweet.lower()
    
    #removes any character not alphabetic or numeroc
    tweet = re.sub(r"[^A-Za-z0-9]",' ', tweet)
    
    
    #removes the stopwords and kemmatizes the remaing word so we get the root word
    tweet = [WordNetLemmatizer().lemmatize(word) for word in tweet.split(' ') if ((word not in stopWords) & len(word)!=0)]
    
    #return the tweet
    return ' '.join(tweet)

#for each tweet in the list clean it
tweets = tweets.apply(cleanTweet) 

df['text'] = tweets

df.head()

Unnamed: 0,text,sentiment
0,minimum maximum begin difficult job reforming ...,Negative
1,drama,Neutral
2,say welcome bjp rahul think relax,Positive
3,chowkidar great service confusion crustal clea...,Positive
4,among world today trump putin may,Positive
