### Importing Dependencies

In [26]:
import numpy as np
import pandas as pd
import re
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Loading the Twitter sentiment dataset

In [22]:
# `names=` is passed without `header=`, so the first line of the file is read
# as data and these labels become the column names.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv('../twitter_data.csv', names=column_names, encoding = 'ISO-8859-1')
# Only the last expression of a cell is auto-displayed, so the shape is printed
# explicitly instead of being evaluated and discarded.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [23]:
# Missing values per column. Printed explicitly: a bare expression that is not
# the last line of the cell is evaluated but never shown.
print(twitter_data.isnull().sum())
# Number of rows for each value of the sentiment label.
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [24]:
# Recode the positive label from 4 to 1 so the target takes the values 0 and 1.
twitter_data['target'] = twitter_data['target'].replace(4, 1)

##### 1 -> Positive Tweet and 0 -> Negative Tweet

In [25]:
_LEMMATIZATION_RESOURCES = None  # (lemmatizer, stop-word set); built on first use


def _get_lemmatization_resources():
    """Return the shared (lemmatizer, English stop-word set), creating it once."""
    global _LEMMATIZATION_RESOURCES
    if _LEMMATIZATION_RESOURCES is None:
        _LEMMATIZATION_RESOURCES = (
            WordNetLemmatizer(),
            frozenset(stopwords.words("english")),
        )
    return _LEMMATIZATION_RESOURCES


def lemmatization(content):
    """Normalise one piece of text into a space-joined string of lemmas.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stop words, lemmatize each remaining
    token with WordNet.

    Parameters
    ----------
    content : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN cell coming
        from pandas) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning or when ``content`` is not a string.

    Notes
    -----
    Relies on the NLTK ``stopwords`` and ``wordnet`` corpora being available.
    """
    # A missing cell would otherwise make re.sub raise TypeError and abort a
    # whole DataFrame.apply run.
    if not isinstance(content, str):
        return ''

    # The lemmatizer and stop-word set do not depend on the input, so they are
    # built once instead of on every call.
    lemmatizer, stop_words = _get_lemmatization_resources()

    letters_only = re.sub('[^a-zA-Z]', ' ', content).lower()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

In [None]:
# Run `lemmatization` once per tweet and keep the result in a new column.
twitter_data['lemmatized_content'] = twitter_data['text'].apply(lemmatization)
# Persist the cleaned frame: a later cell reads '../twitter_data_cleaned.csv',
# and without this write a fresh run of the notebook has no way to produce it.
# index=False keeps the row index out of the file.
twitter_data.to_csv('../twitter_data_cleaned.csv', index=False, encoding='ISO-8859-1')

In [27]:
twitter_data = pd.read_csv('../twitter_data_cleaned.csv', encoding = 'ISO-8859-1')
# Tweets whose cleaned text is empty are stored as empty CSV fields, which
# read_csv turns into NaN by default; restore them to '' so the column holds
# only strings.
twitter_data['lemmatized_content'] = twitter_data['lemmatized_content'].fillna('')
# Printed explicitly because only the last expression of a cell is displayed.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,lemmatized_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [19]:
# First five cleaned tweets.
twitter_data['lemmatized_content'].iloc[:5]

0    switchfoot http twitpic com zl awww bummer sho...
1    upset update facebook texting might cry result...
2    kenichan dived many time ball managed save res...
3                      whole body feel itchy like fire
4                     nationwideclass behaving mad see
Name: lemmatized_content, dtype: object

In [20]:
# First five sentiment labels.
twitter_data['target'].iloc[:5]

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

### Splitting the data into training and test sets

In [29]:
# Features are the cleaned tweet text; labels are the target column cast to str.
x, y = twitter_data['lemmatized_content'], twitter_data['target'].astype(str)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)
print(x_train.shape, x_test.shape)

(1280000,) (320000,)


In [None]:
# Shell commands executed from the notebook: stage every change under the
# current directory, commit it with a fixed message, and push to `origin main`.
# NOTE(review): these run each time the cell is executed (including "Run All");
# consider running them from a terminal instead.
!git add .
!git commit -m "add train_test_split of dataset"
!git push -u origin main