In [1]:
# installing kaggle library
!pip install kaggle



Uploading kaggle.json file

In [2]:
#configuring the path of kaggle.json file
# Assumes kaggle.json has already been uploaded to the current working directory.
# ~/.kaggle/ is presumably where the Kaggle CLI looks for it -- confirm against the Kaggle docs.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600 = read/write for the owner only (the file presumably holds the API token).
!chmod 600 ~/.kaggle/kaggle.json

Importing Twitter sentiment Dataset

In [3]:
#Fetching dataset using API from kaggle
# Downloads the kazanova/sentiment140 dataset archive. When a newer local copy
# already exists the CLI skips the download; add --force to download it again.
!kaggle datasets download -d kazanova/sentiment140

sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
#Extracting compressed dataset

from zipfile import ZipFile
dataset = "/content/sentiment140.zip"

# The handle is named `archive`, not `zip`: binding `zip` at cell level would
# shadow the built-in zip() for every later cell in the notebook.
with ZipFile(dataset, 'r') as archive:
  # No target path is given, so members are extracted into the current working directory.
  archive.extractall()
  print("The dataset has been extracted successfully")

The dataset has been extracted successfully


Importing Dependencies

In [5]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
import nltk
# Fetch the NLTK 'stopwords' package that stopwords.words('english') relies on in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#Printing english stopwords
# Sanity check that the corpus is available; these are the words the stemming step filters out.

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Processing

In [8]:
# Load the raw Sentiment140 CSV into a pandas DataFrame.
# No header/names argument is passed, so pandas takes the first record as the
# column labels (visible in the shape and sample shown below).
raw_csv_path = "/content/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')

In [9]:
#checking number of rows and columns
# Result is a (rows, columns) tuple. The first CSV record is not counted as a row
# here because the preceding read used it as the header.
df.shape

(1599999, 6)

In [10]:
#Checking our dataframe
# Shows 5 randomly chosen rows; no random_state is set, so the rows differ between runs.

df.sample(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1138657,4,1976861568,Sat May 30 19:20:20 PDT 2009,NO_QUERY,ferii,w0oww! shpping dayyy is awsomeeee! i alreadyy ...
592562,0,2217996017,Wed Jun 17 21:42:55 PDT 2009,NO_QUERY,Stealx,"@shannanigans13 well duh, but the problem is t..."
1320310,4,2014569341,Wed Jun 03 01:58:25 PDT 2009,NO_QUERY,Violet_MyLinh,@mileycyrus Can't wait untill Hannah Montana T...
1117592,4,1973451634,Sat May 30 11:10:25 PDT 2009,NO_QUERY,jon_kerr,"is drinking red wine ... and you're bothered,..."
213157,0,1974930693,Sat May 30 14:05:58 PDT 2009,NO_QUERY,jmarrero8,@FernandoLoo u just want to beat me up!! wome...


In [11]:
# Re-read the CSV with explicit column labels, so the first record is kept as
# data instead of being consumed as the header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

In [12]:
# First five rows, to confirm the column names were applied.
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
#checking number of rows and columns
# One row more than the first load: the record previously taken as the header is now data.
df.shape

(1600000, 6)

In [14]:
# Count missing values per column.
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [15]:
#Target value distribution check
# Number of rows per distinct label in the 'target' column.
df['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [16]:
# Relabel the positive class: target 4 becomes 1, every other value is left untouched.
df['target'] = df['target'].replace(4, 1)

In [17]:
#Target value distribution check
# Re-run after the relabelling to confirm that only the labels 0 and 1 remain.
df['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

0 ---> Negative Tweet

1 ---> Positive Tweet

**Stemming**

In [18]:
# Single PorterStemmer instance, reused by stemming() for every word.
port_stem = PorterStemmer()

In [19]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
  """Return the English stop words as a frozenset, loading them only once."""
  return frozenset(stopwords.words('english'))


def stemming(content):
  """Normalise one tweet into a space-joined string of stemmed, non-stop-word tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case the text, split on whitespace, drop English stop words, and
  stem what is left with ``port_stem``.

  Args:
    content: Raw text of a single tweet (str).

  Returns:
    str: The remaining stemmed tokens joined by single spaces; an empty
    string if nothing survives the filtering.
  """
  # The stop-word list used to be re-fetched with stopwords.words('english') for
  # every single word and searched as a list. A cached frozenset gives the same
  # filtering with a single load and constant-time membership tests.
  stop_words = _english_stopwords()

  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stemmed_tokens)

In [20]:
# Run stemming() on each tweet's text and store the result in a new column.
# This is one Python call per row over the whole frame, so expect it to take a while.
df['stemmed_content'] = df['text'].apply(stemming)

In [21]:
# Persist the frame (now including 'stemmed_content') as CSV, without the index column.
output_file = 'output.csv'
df.to_csv(output_file, index=False)
print("The DataFrame has been saved to 'output.csv' in the current working directory.")

The DataFrame has been saved to 'output.csv' in the current working directory.


This notebook ends here. Please refer to Part 2 for the further steps.