# **Data Cleaning**

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Google Colab only: outside Colab, remove this import and load the CSV without files.upload() ----
from google.colab import files
#-------

import io
import missingno
import seaborn as sns


**Importing data**

In [None]:
# Ask the user to upload the CSV through Colab's upload helper; the result is
# later indexed by file name to obtain the uploaded file's bytes.
uploaded = files.upload()

Saving twitter_sentiment_data.csv to twitter_sentiment_data.csv


In [None]:
# Wrap the uploaded file's raw bytes in an in-memory buffer so pandas can
# parse it like a regular CSV file.
csv_buffer = io.BytesIO(uploaded['twitter_sentiment_data.csv'])
dataset = pd.read_csv(csv_buffer)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


**Data Summary**


In [None]:
# (number of rows, number of columns)
dataset.shape


(43943, 3)

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
dataset.describe()

Unnamed: 0,sentiment,tweetid
count,43943.0,43943.0
mean,0.853924,8.367966e+17
std,0.853543,8.568506e+16
min,-1.0,5.926334e+17
25%,0.0,7.970376e+17
50%,1.0,8.402301e+17
75%,1.0,9.020003e+17
max,2.0,9.667024e+17


In [None]:
# Column names, non-null counts and dtypes.
dataset.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43943 entries, 0 to 43942
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  43943 non-null  int64 
 1   message    43943 non-null  object
 2   tweetid    43943 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [None]:
# Count the missing values in each column.
dataset.isna().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

There are no missing values in the dataset.

# **Data Cleaning**

In [None]:
# Cleaning the texts: regex support plus NLTK's stop-word list and Porter stemmer.
import re
import nltk
# Fetch the stop-word corpus; stopwords.words(...) needs it to be available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build `corpus`: one cleaned, stemmed, stop-word-free string per tweet.
#
# Each cleaning step is applied to the result of the previous one. (Previously
# every substitution restarted from the raw message, so only the final
# "letters only" step had any effect and the earlier steps were discarded.)

# Number of leading rows to clean. The dataset holds 43943 rows, so the last
# row is left out; the label slice in the bag-of-words cell uses the same
# bound, so change both together if the final row should be included.
N_ROWS = 43942

# Build the stop-word set and the stemmer once instead of once per word/tweet.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
for message in dataset['message'].iloc[:N_ROWS]:
    # Removing hyperlinks first, while the URL is still intact
    # (\S+ = the non-whitespace run that follows "http://" or "https://").
    review = re.sub(r'https?://\S+', '', message)
    # Removing the retweet marker; \b keeps words that merely end in "RT" intact.
    review = re.sub(r'\bRT\s+', '', review)
    # Removing hashtag symbols (the tag text itself is kept)
    review = re.sub(r'#', '', review)
    # Removing non-ASCII characters (e.g. Chinese text, emoji)
    review = re.sub(r'[^\x00-\x7F]+', '', review)
    # Selecting letters only; every other character becomes a space
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    # Drop English stop words and reduce the remaining words to their stems.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)


Bag-of-words model for feature extraction

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Limit the vocabulary to the 10000 most frequent terms.
cv = CountVectorizer(max_features = 10000)
# Dense term-count matrix with one row per cleaned tweet in `corpus`.
x = cv.fit_transform(corpus).toarray()
# Labels for exactly the rows that were cleaned: select the column by name and
# tie the row count to `corpus` instead of repeating a hard-coded bound.
y = dataset['sentiment'].values[:len(corpus)]
# Features and labels must stay aligned row for row.
assert x.shape[0] == y.shape[0], "feature/label row counts differ"


# **Splitting Dataset**

The dataset is split using an 80/20 ratio,

i.e. 80% of the data is used for training and 20% for testing.

In [None]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split reproducible across runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

# Quick look at the training features and labels.
print("x_train: ", x_train)
print("y_train: ", y_train)


x_train:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
y_train:  [ 1 -1  1 ...  2  1  1]
