# Data Loading

In [21]:
import pandas as pd
# Reading JSON file with lines=True
df = pd.read_csv("Sarcasm.csv")
# Display the dataframe
df = df[['tweet','sarcastic']]
df.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [9]:
df.dropna(inplace=True)


# Data Cleaning

In [12]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from contractions import fix  # To handle contractions like don't -> do not

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text_advanced(text):
    # Expand contractions
    text = fix(text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back to a single string
    cleaned_text = ' '.join(cleaned_tokens)
    
    # Remove excessive whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['tweet'] = df['tweet'].astype(str).apply(clean_text_advanced)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
df

Unnamed: 0,tweet,sarcastic
0,thing got college caffeine addiction,1
1,love professor draw big question mark next ans...,1
2,remember hundred email company covid started g...,1
3,today poppop told “ forced ” go college 🙃 okay...,1
4,volphancarol littlewhitty mysticalmanatee also...,1
...,...,...
3463,population spike chicago 9 month ridiculous,0
3464,would think second last english class year pro...,0
3465,finally surfacing holiday scotland difficult d...,0
3466,could prouder today well done every student go...,0
