In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the dataset

In [None]:
# names= is passed without header=, so pandas treats the first line of the
# file as data and labels the six fields with the names below.
columns = ["sentiment", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=columns,
    encoding="ISO-8859-1",
)
df.head()

In [None]:
# (rows, columns) of the frame loaded above.
df.shape

Sampling the dataset

In [None]:
# Draw 50,000 rows from each sentiment class with a fixed seed, keeping only
# the label and the text. reset_index() renumbers the rows and keeps the
# original row labels as an extra "index" column.
df_sampled = (
    df.groupby('sentiment')[['sentiment', 'text']]
    .sample(n=50000, random_state=1)
    .reset_index()
)
df_sampled

In [None]:
# Bar chart of the number of sampled rows per sentiment class.
df_sampled.groupby('sentiment').size().plot.bar()

In [None]:
# Count missing values in every column of the sample.
df_sampled.isna().sum()

WordCloud

In [None]:
!pip install wordcloud

In [None]:
# from wordcloud import WordCloud
# plt.figure(figsize=(20, 10))

# # Combine all text entries from the 'text' column of your dataframe into a single string
# text_combined = " ".join(cat for cat in df_sampled.text)
# word_cloud = WordCloud(
#     collocations=False, 
#     background_color='white', 
#     width=2000, 
#     height=1000
# ).generate(text_combined)

# # Display the generated Word Cloud
# plt.imshow(word_cloud, interpolation='bilinear')
# plt.axis("off")  # Turn off the axis numbers and labels
# plt.show()  # Display the plot

Preprocessing 

1. Lowercase the text
2. Remove URLs
3. Remove usernames (@mentions)
4. Replace emojis with their text names
5. Replace chat words and numbers (e.g. "lol" to "laugh out loud", "1" to "one")
6. Expand contractions
7. Remove punctuation
8. Lemmatize, and collapse runs of three or more repeated letters
9. Correct misspelled words
10. Remove stopwords

In [None]:
from num2words import num2words
# Slang lookup table: only the acronym and its expansion are kept.
slangDf = pd.read_csv("slang.csv")[['acronym', 'expansion']]
slangDf.head()


In [None]:
def replace_chat_words(text):
    """Expand one token: chat acronym -> expansion, digit string -> number words.

    Args:
        text: A single token (string), typically one whitespace-separated
            word of a tweet.

    Returns:
        The expansion listed in ``slangDf`` when the token is a known
        acronym, the spelled-out number from ``num2words`` when the token
        consists of decimal digits, and the token unchanged otherwise
        (including when no usable expansion or number conversion exists).
    """
    expansions = slangDf.loc[slangDf['acronym'] == text, 'expansion'].values
    if len(expansions) >= 1:
        # 'lol' is deliberately mapped to its second listed expansion
        # (presumably the first row is a less common reading - verify
        # against slang.csv). Fall back to the first row when only one
        # exists instead of raising IndexError.
        use_second = text == 'lol' and len(expansions) > 1
        expansion = expansions[1] if use_second else expansions[0]
        # An empty expansion cell is read by pandas as NaN (a float); keep
        # the original token so callers can still join the results as strings.
        return expansion if isinstance(expansion, str) else text
    # isdecimal() rather than isnumeric(): characters such as '½' or '²'
    # count as "numeric" but cannot be parsed as a number by num2words.
    if text.isdecimal():
        try:
            return num2words(text)
        except (ArithmeticError, ValueError):
            # Values num2words cannot convert (e.g. beyond its supported
            # range) are left as they are.
            return text
    return text
    
# Quick check of the helper on a known acronym.
replace_chat_words('lol')

In [None]:
import re
import emoji
import contractions as con
import string
import en_core_web_lg
# pip install spacy
# python -m spacy download en_core_web_lg
from autocorrect import Speller

# Load the large English spaCy pipeline once; preprocessingText reuses it
# for part-of-speech tags and lemmas.
nlp=en_core_web_lg.load()
# English spell-corrector from the autocorrect package.
speller=Speller(lang='en')
# spaCy's default stop-word collection for this pipeline, used to filter tokens.
stop_words=nlp.Defaults.stop_words

def preprocessingText(text):
    """Normalize one raw tweet into a cleaned, space-separated token string.

    Steps, in order: lowercase; strip URLs and @usernames; replace emojis
    with their text names; expand chat acronyms and digit tokens through
    ``replace_chat_words``; expand contractions; delete punctuation;
    lemmatize with spaCy (nouns and proper nouns keep their surface form but
    have runs of three or more identical characters collapsed to one);
    spell-correct; drop stop words.

    Args:
        text: The raw tweet. Any value that is not a ``str`` (for example
            ``None`` or NaN from a missing CSV field) is treated as an
            empty tweet.

    Returns:
        The cleaned text, or ``""`` when the input is not a string.
    """
    # A missing value would otherwise raise AttributeError on .lower() and
    # abort the whole map over the dataset.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Remove URLs ("http\S+" already covers "https...").
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove @usernames.
    text = re.sub(r"@\S+", "", text)
    # Replace emojis by their names. Spaces are used as delimiters and the
    # underscores inside the names become spaces, so that the punctuation
    # step below leaves separate words ("red heart") instead of gluing the
    # name - and any neighbouring emoji - into one token ("redheart").
    # Note this splits every remaining underscore, not only emoji names.
    text = emoji.demojize(text, delimiters=(" ", " ")).replace("_", " ")
    # Replace chat words and numbers, token by token.
    text = " ".join(replace_chat_words(word) for word in text.split())
    # Expand contractions.
    text = con.fix(text)
    # Delete punctuation characters.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Nouns / proper nouns: keep the surface form but collapse 3+ repeated
    # characters to one; every other token is replaced by its lemma.
    text = " ".join(
        re.sub(r"(.)\1\1+", r"\1", str(token))
        if token.pos_ in ("PROPN", "NOUN")
        else token.lemma_
        for token in nlp(text)
    )
    # Correct misspelled words.
    text = speller(text)
    # Remove stop words; joining the split words also trims outer whitespace.
    return " ".join(word for word in text.split() if word not in stop_words)


In [None]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext

# Enable Arrow-based transfer between Spark and pandas.
conf = SparkConf().set("spark.sql.execution.arrow.pyspark.enabled", "true")
# getOrCreate keeps this cell re-runnable: SparkContext(conf=conf) raises when
# a context is already active in the kernel. When a context already exists it
# is reused as it is, and `conf` is not applied to it again.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)


In [None]:
# The file has no header row (the pandas load of the same file supplies the
# column names through names=), so header=True would silently consume the
# first tweet as column names. The encoding is set to match the pandas load;
# Spark would otherwise decode the file as UTF-8.
# NOTE(review): `df` is rebound here from the pandas frame to an RDD of
# (sentiment, text) pairs, as in the original cell.
df = sqlContext.read.csv(
    'training.1600000.processed.noemoticon.csv',
    header=False,
    encoding='ISO-8859-1',
).rdd.map(lambda row: (row[0], row[5]))  # column 0 = sentiment, column 5 = text
# Label '0' stays 0, every other label becomes 1; the text is cleaned with
# preprocessingText.
df_processed = df.map(
    lambda pair: (0 if pair[0] == '0' else 1, preprocessingText(pair[1]))
)


In [None]:
# Turn the RDD of (label, text) pairs into a Spark DataFrame with named columns.
# NOTE(review): this rebinds df_processed from an RDD to a DataFrame, so the
# cell looks non-idempotent - confirm before re-running it on its own.
df_processed=df_processed.toDF(["sentiment", "text"])

In [None]:
# Runs the pending Spark pipeline and collects every row to the driver as a
# pandas DataFrame; the result is only displayed, not stored in a variable.
df_processed.toPandas()