In [1]:
import pandas as pd
import re
import html
from langdetect import detect

import contractions


In [7]:
import nltk
# One-time setup: uncomment and run the line below if the NLTK 'stopwords'
# corpus has not been downloaded on this machine yet.
# nltk.download('stopwords')

from nltk.corpus import stopwords
# Display the English stopword list for reference.
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Data Cleaning

- Removing emojis
- Removing URLs
- Removing non-ASCII (Unicode) characters
- Removing hashtags and mentions
- Expanding contractions and removing **stopwords**
- Removing special characters


In [8]:
def remove_emojis(text):
    """Delete emoji and other pictographic code points from ``text``.

    Every run of characters that falls inside one of the Unicode ranges
    listed below is replaced with the empty string; everything else is
    returned unchanged.
    """
    # Members of the regex character class, joined verbatim in this order.
    unicode_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (same range repeated; redundant but harmless)
        u"\U000024C2-\U0001F251",  # very broad: also spans CJK, kana and Hangul
        u"\U0001f926-\U0001f937",  # gesture / people emoji
        u"\U00010000-\U0010ffff",  # every supplementary-plane code point
        u"\u2640-\u2642",          # gender signs
        u"\u2600-\u2B55",          # misc symbols through the large circle
        u"\u200d",                 # zero-width joiner
        u"\u23cf",                 # eject symbol
        u"\u23e9",                 # fast-forward symbol
        u"\u231a",                 # watch
        u"\ufe0f",                 # variation selector-16
        u"\u3030",                 # wavy dash
    )
    matcher = re.compile("[" + "".join(unicode_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)
def remove_stopwords(text):
    """Drop English stopwords from ``text`` and return the remaining tokens.

    The text is tokenised with NLTK's ``word_tokenize``; every token whose
    lowercase form is in NLTK's English stopword list is discarded and the
    surviving tokens (original casing kept) are joined with single spaces.

    Args:
        text: String to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    # Imported locally because none of the notebook's import cells bring
    # ``word_tokenize`` into scope; without this the function raises
    # NameError when the notebook is run on a fresh kernel.
    # NOTE(review): word_tokenize relies on NLTK tokenizer data (e.g. 'punkt')
    # being downloaded -- confirm it is available in the target environment.
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words('english'))
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

def preprocess_text(text):
    """Normalise a raw tweet into lowercase, stopword-free ASCII text.

    Steps, in order: unescape HTML entities, strip URLs, run
    ``contractions.fix``, drop non-ASCII characters, remove @mentions and
    #hashtags, remove every character that is neither a word character nor
    whitespace, run ``remove_emojis``, lowercase, drop tokens that start with
    ``pictwittercom`` and finally remove English stopwords.

    Args:
        text: Raw message. Non-string values (e.g. ``None`` or the float NaN
            pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN; html.unescape would raise TypeError
    # on them, so short-circuit to an empty result instead of crashing.
    if not isinstance(text, str):
        return ''

    # Convert HTML entities back to their original characters
    text = html.unescape(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Expand contractions before punctuation (apostrophes) is stripped below
    text = contractions.fix(text)

    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (#topic) -- the whole tag is dropped, not just the '#'
    text = re.sub(r'#\w+', '', text)

    # Remove special characters (anything that is not a word char or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove emojis. The ASCII round-trip above has already discarded all
    # non-ASCII characters, so this acts only as a safety net.
    text = remove_emojis(text)

    # Convert text to lowercase
    text = text.lower()

    # With punctuation gone, pic.twitter.com links have collapsed into a
    # single token beginning with 'pictwittercom'; drop those tokens.
    text = ' '.join(word for word in text.split() if not word.startswith('pictwittercom'))

    text = remove_stopwords(text)

    return text






## Considering only English texts

In [9]:

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Any ordinary error raised by ``detect`` is treated as "not English" and
    yields False, so the function can be used directly as a row filter.

    Args:
        text: Text to classify.

    Returns:
        bool: True if the detected language code is ``'en'``, else False.
    """
    # NOTE(review): langdetect is documented as non-deterministic unless
    # DetectorFactory.seed is set -- confirm whether a fixed seed is wanted
    # for reproducible filtering.
    try:
        return detect(text) == 'en'
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and a long-running
        # apply() can be interrupted.
        return False

## Balancing the number of rows belonging to each label

In [10]:

def balance_dataframe(df, binary_column, text_column):
    """Down-sample the larger class so labels 0 and 1 have equal row counts.

    Rows are split by ``binary_column`` (values 0 and 1), each group is
    ordered by the character length of ``text_column`` (longest first), and
    both groups are cut to the size of the smaller one, so the longest texts
    of the larger class are the ones retained.

    Args:
        df: Input DataFrame; it is not modified.
        binary_column: Name of the 0/1 label column.
        text_column: Name of the string column whose length drives the
            ordering; it is dropped from the result.

    Returns:
        A new DataFrame with the label-0 rows followed by the label-1 rows,
        a fresh integer index, and without ``text_column`` or the temporary
        ``text_length`` column.
    """
    def _longest_first(label_value):
        # Rows carrying ``label_value``, ordered by text length descending.
        group = df[df[binary_column] == label_value]
        group = group.assign(text_length=group[text_column].str.len())
        return group.sort_values(by='text_length', ascending=False)

    zeros = _longest_first(0)
    ones = _longest_first(1)

    # Both groups are cut to the smaller group's size; for the smaller group
    # itself the slice is a no-op.
    quota = min(zeros.shape[0], ones.shape[0])
    zeros, ones = zeros.iloc[:quota], ones.iloc[:quota]

    combined = pd.concat([zeros, ones], ignore_index=True)

    # The helper length column and the raw text column are not returned.
    return combined.drop(['text_length', text_column], axis=1)


## Function For triggering pre-processing

In [11]:
def preprocess_and_balance_dataframe(df):
    """Clean, language-filter and class-balance a tweet DataFrame.

    Pipeline: ``preprocess_text`` is applied to the ``message`` column to
    build ``cleaned_text``; rows whose cleaned text is not detected as
    English by ``is_english`` are dropped; the remainder is passed to
    ``balance_dataframe`` with ``'label'`` as the class column and
    ``'message'`` as the text column.

    Args:
        df: DataFrame with at least ``message`` and ``label`` columns. It is
            left unmodified.

    Returns:
        The DataFrame returned by ``balance_dataframe``.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'cleaned_text' column as a hidden side effect.
    cleaned = df.assign(cleaned_text=df['message'].apply(preprocess_text))

    # Keep only rows whose cleaned text is detected as English; the explicit
    # bool cast keeps the mask valid even when the frame is empty.
    english_mask = cleaned['cleaned_text'].apply(is_english).astype(bool)
    english_only = cleaned[english_mask].reset_index(drop=True)

    # Balance on 'label'; ordering within each class uses the raw 'message'.
    return balance_dataframe(english_only, 'label', 'message')

## Running for all three datasets

In [16]:
# Dataset 1: load and show the class counts before any processing.
df1 = pd.read_csv('sentiment_tweets3.csv')
print(df1['label'].value_counts())

# Discard the saved index column, then clean, filter and balance.
df1 = preprocess_and_balance_dataframe(df1.drop(columns='Unnamed: 0'))
print(df1['label'].value_counts())
df1

label
0    8000
1    2314
Name: count, dtype: int64
label
0    1948
1    1948
Name: count, dtype: int64


Unnamed: 0,label,cleaned_text
0,0,men follow men nobody tweets
1,0,bought mh bundle create task called mario star...
2,0,brand business putting logo products services ...
3,0,assumes taboo start email lol considering repl...
4,0,great stuff thanks 4 add bounce done roll cheers
...,...,...
3891,1,wow love depression
3892,1,depression wanima
3893,1,depression came back
3894,1,depression 0beck 1


In [17]:
# Dataset 2: load and show the class counts before any processing.
df2 = pd.read_csv('tweets_combined.csv')
print(df2['target'].value_counts())

# Discard the saved index column and rename the two remaining columns
# (positionally) to the names the pipeline expects.
df2 = df2.drop(columns='Unnamed: 0').set_axis(['message', 'label'], axis=1)

df2 = preprocess_and_balance_dataframe(df2)
print(df2['label'].value_counts())
df2

target
0    2357
1     843
Name: count, dtype: int64
label
0    679
1    679
Name: count, dtype: int64


Unnamed: 0,label,cleaned_text
0,0,feeling depressed know alone know see hear sup...
1,0,erosion soul caused people positions power liv...
2,0,depression may carry feeling worthlessness fee...
3,0,going keep banging cos true focus get stop tel...
4,0,going keep banging cos true focus get stop tel...
...,...,...
1353,1,working new years eve
1354,1,forgot cheese cake work
1355,1,one sec
1356,1,god please help


In [18]:
# Dataset 3: load and show the class counts before any processing.
df3 = pd.read_excel('Twitter_Non-Advert.xlsx')
print(df3['label'].value_counts())

# Rename the two columns (positionally) to the names the pipeline expects.
df3 = df3.set_axis(['message', 'label'], axis=1)

df3 = preprocess_and_balance_dataframe(df3)
print(df3['label'].value_counts())
df3

label
1    1268
0     783
Name: count, dtype: int64
label
0    768
1    768
Name: count, dtype: int64


Unnamed: 0,label,cleaned_text
0,0,812th done wow canny believe cracking morning ...
1,0,thrilled opportunity talk mentalhealth nftart ...
2,0,manifestationmonday favorite thing mondays lau...
3,0,orange heart dog face happy international dog ...
4,0,good morning paris tokyo tower thermometer ast...
...,...,...
1531,1,many would like see mentalhealth elements heal...
1532,1,generalised anxiety disorder include worry mon...
1533,1,loss hormone allopregnanolone linked altered b...
1534,1,commitments asks candidates ratio one social w...


## Saving the combined results

In [19]:
# Stack the three balanced datasets row-wise and report the class counts.
result_concat_row = pd.concat([df1, df2, df3])
print(result_concat_row['label'].value_counts())

# Drop exact duplicate rows. NOTE: this runs after balancing, so the two
# classes can end up with slightly different counts (see the output below).
result_concat_row = result_concat_row.drop_duplicates()
print(result_concat_row['label'].value_counts())

# Persist the combined result without the (non-unique) row index.
result_concat_row.to_csv('Cleaned_Tweets.csv', index=False)

label
0    3395
1    3395
Name: count, dtype: int64
label
0    3313
1    3275
Name: count, dtype: int64
