In [None]:
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


In [None]:
# Directory holding the competition files; change this single constant to run
# the notebook against a different copy of the data.
DATA_DIR = '/kaggle/input/dm-2024-isa-5810-lab-2-homework'

# Three CSV tables: the identification table, the emotion table and the
# sample submission.
identification = pd.read_csv(f'{DATA_DIR}/data_identification.csv')
emotion = pd.read_csv(f'{DATA_DIR}/emotion.csv')
sample = pd.read_csv(f'{DATA_DIR}/sampleSubmission.csv')

# The tweets file is line-delimited JSON (one record per line), hence lines=True.
file_path = f'{DATA_DIR}/tweets_DM.json'
tweets = pd.read_json(file_path, lines=True)

tweets.head()

In [None]:
# Flatten the nested record: each `_source` value holds a 'tweet' mapping, and
# three of its entries are copied out into top-level columns.
EXTRACTED_FIELDS = ('text', 'hashtags', 'tweet_id')

for field_name in EXTRACTED_FIELDS:
    tweets[field_name] = tweets['_source'].apply(
        lambda source, key=field_name: source['tweet'][key]
    )

# Print each extracted column in turn.
for field_name in EXTRACTED_FIELDS:
    print(tweets[field_name])

In [None]:
# The data set is too large to preprocess in full, so it is sampled first
# (in a later cell) and preprocessed afterwards.
#
# Attach the emotion label to every tweet. The left join keeps tweets that
# have no row in `emotion`; their label is simply missing.
# The guard keeps this cell idempotent: merging a second time would add a
# stray 'emotion_dup' column because of the suffixes below.
if 'emotion' not in tweets.columns:
    tweets = tweets.merge(emotion, on='tweet_id', how='left', suffixes=(None, '_dup'))
# Next step (following cells): merge with the identification table to get the
# train and test splits.


In [None]:
# Peek at the first five rows after the emotion merge.
tweets.head(n=5)

In [None]:
# Attach the identification table to the tweets. No join key is passed, so
# pandas joins on the column names the two frames share, using its default
# inner join.
tweets = pd.merge(left=tweets, right=identification)

In [None]:
# Peek at the first five rows after the identification merge.
tweets.head(n=5)

In [None]:
# Partition the tweets by their 'identification' value into the train rows
# and the test rows.
is_train_row = tweets['identification'] == 'train'
is_test_row = tweets['identification'] == 'test'

train_tweet = tweets.loc[is_train_row]
test_tweet = tweets.loc[is_test_row]

In [None]:
# Work on a fixed 30% random sample of the training tweets to keep the later
# preprocessing and model fitting tractable.
# The sample is drawn from `tweets` rather than from `train_tweet` itself so
# the cell is idempotent: re-running it yields the same 30% sample instead of
# shrinking the frame again (30% of 30%, ...).
TRAIN_SAMPLE_FRAC = 0.3
train_tweet = (
    tweets[tweets['identification'] == 'train']
    .sample(frac=TRAIN_SAMPLE_FRAC, random_state=42)
)


In [None]:
# NLTK resources used by the preprocessing step: the stop-word list, the
# tokenizer models and the WordNet data for lemmatization.
# 'punkt_tab' is the tokenizer resource that newer NLTK releases look up for
# word_tokenize; on older releases the download is expected to just report
# the package as unknown and carry on - verify against the installed version.
NLTK_RESOURCES = (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',  # Ensure proper lemmatization support
)

for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

In [None]:
# Preprocessing function with added checks

# Cache for the NLTK objects that preprocess_text needs. They are built once
# on first use instead of once per tweet: re-reading the stop-word list and
# re-creating the lemmatizer for every row is pure overhead.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip @mentions and #hashtags
    (the tag word is removed together with the '#'); drop every character
    that is not an ASCII letter or whitespace; tokenize; remove English
    stop words; lemmatize each remaining token.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` (e.g. ``None`` or a
        float NaN from a missing value) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input or when no token survives.
    """
    # Handle missing or non-string entries
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (http://, https://, www)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove mentions (@username) and hashtags (#hashtag)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords, lemmatize what is left, and rejoin into one string
    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Apply preprocessing to the 'text' column of the sampled training tweets.
train_tweet['cleaned_text'] = train_tweet['text'].apply(preprocess_text)

# Check the results. The 'cleaned_text' column exists only on `train_tweet`
# (it was never added to `tweets`), so that is the frame to inspect;
# indexing `tweets` here raises a KeyError.
train_tweet[['text', 'cleaned_text']].head()


In [None]:
# First five cleaned tweets.
train_tweet.loc[:, 'cleaned_text'].head(n=5)

In [None]:
# First five rows of the sampled training frame.
train_tweet.head(n=5)

In [None]:
# Same preview as the previous cell: first five rows of the training frame.
train_tweet.iloc[:5]

In [None]:
# Discard every tweet whose text occurs more than once. keep=False removes
# all copies of a repeated text, not only the second and later occurrences.
train_tweet = train_tweet.drop_duplicates(subset='text', keep=False)
train_tweet.head(n=5)

# TF-IDF


In [None]:
# Target labels and feature frame for the labelled (train) tweets.
y_train_data = train_tweet['emotion']
X_train_data = train_tweet.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)

# Hold out 20% of the labelled tweets for validation, stratified by emotion.
# The held-out frame gets its own name (X_val) so that X_test below does not
# overwrite it; otherwise this cell fails when run a second time.
X_train, X_val, y_train, y_test = train_test_split(
    X_train_data, y_train_data, test_size=0.2, random_state=42, stratify=y_train_data
)

# NOTE(review): the vectorizer is fed the raw 'text' column, not the
# 'cleaned_text' column computed earlier - confirm this is intended.
tfidf = TfidfVectorizer(max_features=1500)

# Fit the vocabulary on the training part only, then reuse it for validation.
# Both matrices stay sparse: a dense (rows x 1500) float array costs gigabytes
# at this scale, and scikit-learn's random forest accepts sparse input.
X = tfidf.fit_transform(X_train['text'])
X_test = tfidf.transform(X_val['text'])

In [None]:
# Encode the emotion labels as integers. The encoded validation labels get
# their own name instead of overwriting y_test, so this cell can be re-run
# (encoding already-encoded labels would raise on unseen values).
le = LabelEncoder()
y = le.fit_transform(y_train)
y_val_encoded = le.transform(y_test)

# Seeded so results are reproducible (same seed as the sampling and the
# train/validation split); n_jobs=-1 builds the trees on all available cores.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X, y)

# Accuracy on the held-out validation split.
y_pred = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_val_encoded, y_pred):.4f}")

# Test-set prediction and the submission file are produced in the following
# cell; the identical copy that used to sit here only repeated that work.

In [None]:
# Vectorise the test tweets with the already-fitted TF-IDF vocabulary.
# The result is left sparse (no .toarray()): the classifier's predict accepts
# sparse input, and a dense copy of the whole test matrix only costs memory.
X_test_data = tfidf.transform(test_tweet['text'])

# Predict encoded classes and map them back to the original emotion labels.
y_test_pred = clf.predict(X_test_data)

y_pred_labels = le.inverse_transform(y_test_pred)

# One row per test tweet: its id and the predicted emotion.
submission = pd.DataFrame({
    'id': test_tweet['tweet_id'],
    'emotion': y_pred_labels
})

submission.to_csv('/kaggle/working/submission.csv', index=False)

# Transformer

# BERT
