## Cleaning of the text

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import sys

In [2]:
# loading raw tweet data

PROJECT_ROOT = Path().resolve().parent
sys.path.append(str(PROJECT_ROOT))

In [3]:
from src.config import RAW_DATA_PATH, CLEANED_DATA_PATH, TEXT_COLUMN, TARGET_COLUMN

Configuration loaded successfully.


In [4]:
data = pd.read_csv(RAW_DATA_PATH)

df = data.copy()

df.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive


In [5]:
# meta_data

print ("No. of rows: ", df.shape[0])
print ("\n No. of columns: ", df.shape[1])
print ("\nName of all the columns: \n\n", df.columns.to_list())
print ("\nDatatype of the columns: \n\n", df.dtypes)
print ("The info: \n")
df.info()

No. of rows:  27481

 No. of columns:  4

Name of all the columns: 

 ['textID', 'text', 'selected_text', 'sentiment']

Datatype of the columns: 

 textID           object
text             object
selected_text    object
sentiment        object
dtype: object
The info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [8]:
df.dropna(inplace = True)

In [9]:
df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [10]:
# removing the column "selected_text"
df = df.drop(columns= ["selected_text"], errors = "ignore")

In [11]:
df.head(10)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,positive
7,50e14c0bb8,Soooo high,neutral
8,e050245fbd,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,positive


In [12]:
# Normalize target column

df[TARGET_COLUMN] = (df[TARGET_COLUMN].astype(str).str.lower().str.strip())

df = df[df[TARGET_COLUMN].isin(["positive", "negative", "neutral"])]

In [13]:
# Text Cleaning

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text: str) -> str:

    text = text.lower()
    text = re.sub(r"http\S+", "", text)  #removing urls
    text = re.sub(r"@\w+", "", text)    # removing @
    text = re.sub(r"#\w+", "", text)    #removing #
    text = re.sub(r"[^\a-z0-9\s]", "", text)    # removing punctuation
    tokens = text.split()

    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(tokens)

In [15]:
df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str).apply(clean_text)

df = df[df[TARGET_COLUMN].str.len() > 0]

df.head(10)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"i`d responded, going",neutral
1,549e992a42,sooo sad miss san diego!!!,negative
2,088c60f138,bos bullying me...,negative
3,9642c003ef,interview! leave alone,negative
4,358bd9e861,"son ****, couldn`t put release already bought",negative
5,28b57f3990,- shameless plugging best ranger forum earth,neutral
6,6e0c6d75b1,2am feeding baby fun smile coo,positive
7,50e14c0bb8,soooo high,neutral
8,e050245fbd,,neutral
9,fc2cbefa9d,journey!? wow... u became cooler. hehe... (is ...,positive


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     27480 non-null  object
 1   text       27480 non-null  object
 2   sentiment  27480 non-null  object
dtypes: object(3)
memory usage: 858.8+ KB


In [17]:
# keeping only the required column

df_cleaned = df[[TEXT_COLUMN, TARGET_COLUMN]].copy()

df_cleaned.head(10)


Unnamed: 0,text,sentiment
0,"i`d responded, going",neutral
1,sooo sad miss san diego!!!,negative
2,bos bullying me...,negative
3,interview! leave alone,negative
4,"son ****, couldn`t put release already bought",negative
5,- shameless plugging best ranger forum earth,neutral
6,2am feeding baby fun smile coo,positive
7,soooo high,neutral
8,,neutral
9,journey!? wow... u became cooler. hehe... (is ...,positive


In [19]:
# saving cleaned data

df_cleaned.to_csv(CLEANED_DATA_PATH, index = False)

print ("Cleaned data saved to: ", CLEANED_DATA_PATH)

Cleaned data saved to:  E:\BIA\ML Class Project\Twitter_post_sentiment_analysis\data\clean_tweets.csv
