In [1]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Text, Iterable, Union
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sweth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sweth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.wordnet_lem = WordNetLemmatizer()

    def preprocess(self, TEXT: str) -> str:
        cleaned_text = self._clean(TEXT)
        text_no_stop_words = self.remove_stop_words(cleaned_text)
        text_no_freq_words = self.remove_freq_words(text_no_stop_words)
        lemmatized_text = self.lemmatize_text(text_no_freq_words)
        return lemmatized_text

    def preprocess_dataframe(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        preprocessed_df = df.copy()
        preprocessed_df[text_column] = preprocessed_df[text_column].apply(self.preprocess)
        return preprocessed_df

    def _clean(self, TEXT: str) -> str:
      
        if not isinstance(TEXT, str):
            TEXT = str(TEXT)
        # converting to lowercase, removing URL links, special characters, punctuations...
            TEXT = TEXT.lower() # converting to lowercase
            TEXT = re.sub('https?://\S+|www\.\S+', '', TEXT) # removing URL links
            TEXT = re.sub(r"\b\d+\b", "", TEXT) # removing number 
            TEXT = re.sub('<.*?>+', '', TEXT) # removing special characters, 
            TEXT = re.sub('[%s]' % re.escape(string.punctuation), '', TEXT) # punctuations
            TEXT = re.sub('\n', '', TEXT)
            TEXT = re.sub('[’“”…]', '', TEXT)

        #removing emoji: 
            emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
            TEXT = emoji_pattern.sub(r'', TEXT)   

        # removing short form: 
            TEXT=re.sub("isn't",'is not',TEXT)
            TEXT=re.sub("he's",'he is',TEXT)
            TEXT=re.sub("wasn't",'was not',TEXT)
            TEXT=re.sub("there's",'there is',TEXT)
            TEXT=re.sub("couldn't",'could not',TEXT)
            TEXT=re.sub("won't",'will not',TEXT)
            TEXT=re.sub("they're",'they are',TEXT)
            TEXT=re.sub("she's",'she is',TEXT)
            TEXT=re.sub("There's",'there is',TEXT)
            TEXT=re.sub("wouldn't",'would not',TEXT)
            TEXT=re.sub("haven't",'have not',TEXT)
            TEXT=re.sub("That's",'That is',TEXT)
            TEXT=re.sub("you've",'you have',TEXT)
            TEXT=re.sub("He's",'He is',TEXT)
            TEXT=re.sub("what's",'what is',TEXT)
            TEXT=re.sub("weren't",'were not',TEXT)
            TEXT=re.sub("we're",'we are',TEXT)
            TEXT=re.sub("hasn't",'has not',TEXT)
            TEXT=re.sub("you'd",'you would',TEXT)
            TEXT=re.sub("shouldn't",'should not',TEXT)
            TEXT=re.sub("let's",'let us',TEXT)
            TEXT=re.sub("they've",'they have',TEXT)
            TEXT=re.sub("You'll",'You will',TEXT)
            TEXT=re.sub("i'm",'i am',TEXT)
            TEXT=re.sub("we've",'we have',TEXT)
            TEXT=re.sub("it's",'it is',TEXT)
        
            TEXT=re.sub("don't",'do not',TEXT)
            TEXT=re.sub("that´s",'that is',TEXT)
            TEXT=re.sub("I´m",'I am',TEXT)
            TEXT=re.sub("it’s",'it is',TEXT)
            TEXT=re.sub("she´s",'she is',TEXT)
            TEXT=re.sub("he’s'",'he is',TEXT)
            TEXT=re.sub('I’m','I am',TEXT)
            TEXT=re.sub('I’d','I did',TEXT)
            TEXT=re.sub("he’s'",'he is',TEXT)
            TEXT=re.sub('there’s','there is',TEXT)
    
     
        return TEXT
       

    def remove_stop_words(self, text: str) -> str:
        return ' '.join([word for word in text.split() if word not in self.stop_words])

    def remove_freq_words(self, text: str, freq_words: Iterable[str] = None) -> str:
        if freq_words is None:
            freq_words = []  # Set a default value (empty list) for freq_words
        return ' '.join([word for word in text.split() if word not in freq_words])
    def lemmatize_text(self, text: str) -> str:
        return ' '.join([self.wordnet_lem.lemmatize(word) for word in text.split()])


In [3]:
class FeatureExtractor:
    def __init__(self):
        self.count_vectorizer = CountVectorizer(ngram_range=(1, 2), binary=False)
        self.tfidf_transformer = TfidfTransformer(use_idf=False)

    def fit_transform(self, X: pd.Series) -> pd.DataFrame:
        X_count = self.count_vectorizer.fit_transform(X)
        X_tfidf = self.tfidf_transformer.fit_transform(X_count)
        return X_tfidf

    def transform(self, X: pd.Series) -> pd.DataFrame:
        X_count = self.count_vectorizer.transform(X)
        X_tfidf = self.tfidf_transformer.transform(X_count)
        return X_tfidf

In [4]:
class SentimentClassifier:
    def __init__(self):
        self.clf = LogisticRegression(penalty='l2', solver='liblinear', multi_class='auto', C=100)

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        self.clf.fit(X, y)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        return self.clf.predict(X)

    def score(self, X: pd.DataFrame, y_true: pd.Series, scoring_func, average=None) -> float:
        y_pred = self.predict(X)
        return scoring_func(y_true, y_pred, average='macro')



In [6]:
def main():
    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')

    preprocessor = TextPreprocessor()
    train_data = preprocessor.preprocess_dataframe(train_data, 'TEXT')
    test_data = preprocessor.preprocess_dataframe(test_data, 'TEXT')

    feature_extractor = FeatureExtractor()
    X = feature_extractor.fit_transform(train_data['TEXT'])
    y = train_data['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    sentiment_classifier = SentimentClassifier()
    sentiment_classifier.fit(X_train, y_train)

    f1 = sentiment_classifier.score(X_test, y_test, f1_score, average='macro')
    print(f"F1-score: {f1:.4f}")

    cm = confusion_matrix(y_test, sentiment_classifier.predict(X_test))
    print("Confusion Matrix:\n", cm)

    X_submission = feature_extractor.transform(test_data['TEXT'])
    predicted_labels = sentiment_classifier.predict(X_submission)
    test_data['LABEL'] = predicted_labels
    test_data = test_data.drop('TEXT', axis=1)
    test_data.to_csv('C:/Users/sweth/Downloads/submission.csv', index=False)

if __name__ == "__main__":
    main()


F1-score: 0.9164
Confusion Matrix:
 [[6383   46   25]
 [ 169 3379  308]
 [  90  375 3289]]
