<a href="https://colab.research.google.com/github/Tanushreejaganathan/Sentiment_Analysis_tamil_tulu/blob/main/Tamil_preprocessed_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# WordNet, and the perceptron POS tagger). Packages are requested one at a
# time, in the order listed.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


# Custom stopwords derived from the dataset.
# Entries are grouped by script purely for readability; the result is a single
# flat set, so grouping and ordering have no effect on membership tests.
custom_stopwords = {
    # --- Tamil-script tokens ---
    'இந்த', 'படம்', 'வாழ்த்துக்கள்', 'இது', 'திரௌபதி', 'சாதி',
    'வெற்றி', 'பெற', 'சார்பாக', 'ஒரு', 'தான்',

    # --- Latin-script tokens (English and romanised Tamil), alphabetical ---
    'a', 'ah', 'ajith', 'all', 'am', 'and', 'anna',
    'best', 'bgm', 'but',
    'da', 'dialogue', 'dislike',
    'ellam', 'enna',
    'fan', 'fans', 'film', 'for', 'from',
    'here', 'hit',
    'i', 'illa', 'in', 'indha', 'intha', 'iruku', 'irukku', 'is', 'ithu',
    'k', 'ku',
    'la', 'level', 'like', 'likes', 'love',
    'm', 'marana', 'mass', 'mattum', 'mela', 'movie', 'music',
    'na', 'nalla', 'neraya', 'ngk', 'nu',
    'of', 'oru',
    'pa', 'padam', 'pakka', 'panna', 'petta', 'podunga', 'pola',
    'rajini', 'release',
    'sema', 'semma', 'sir', 'star', 'super', 'suriya', 'surya',
    'tamil', 'teaser', 'tha', 'thaan', 'thala', 'thalaiva', 'thalaivar',
    'thalapathy', 'than', 'the', 'to', 'trailer',
    'u',
    'va', 'vera', 'vijay', 'views',
    'waiting',
    'ya', 'you', 'yuvan',
}

# Helper: pick the WordNet part of speech to use when lemmatizing a word
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), and the upper-cased first letter of the resulting tag
    selects the WordNet category: J -> adjective, V -> verb, R -> adverb.
    Every other tag, including N, maps to noun.
    """
    _, assigned_tag = nltk.pos_tag([word])[0]
    initial = assigned_tag[0].upper()

    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and any unrecognised tag both fall back to noun.
    return wordnet.NOUN

# Preprocessing function
def preprocess_text(text, stopwords):
    """Clean, tokenize, filter and normalise one text sample.

    Pipeline, in order: lowercase -> strip URLs -> strip HTML tags -> keep
    only ASCII letters, Tamil characters and whitespace -> tokenize -> drop
    stopwords -> drop tokens of length <= 2 -> lemmatize -> stem -> remove
    repeated tokens (first occurrence wins).

    Args:
        text: The raw text. Values that are not ``str`` (for example a
            missing cell) are returned unchanged.
        stopwords: Container of lowercase tokens to discard; only
            membership (``in``) is used.

    Returns:
        The processed tokens joined by single spaces, or ``text`` itself
        when it is not a string.
    """
    # Guard first so non-string values never touch the NLP tooling.
    if not isinstance(text, str):
        return text

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Lowercase conversion
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Keep only ASCII letters, Tamil characters and whitespace. This single
    # whitelist already removes digits, punctuation and underscores.
    #
    # There is deliberately NO follow-up ``[^\w\s]`` pass: in Python's ``re``
    # module ``\w`` does not match combining marks, so such a pass deletes
    # Tamil vowel signs and the virama (e.g. 'படம்' -> 'படம'), corrupting the
    # words and preventing Tamil stopwords from ever matching.
    #
    # The range stops at U+0BF2 so that the Tamil calendar/currency symbols
    # (U+0BF3-U+0BFA) are still stripped as special characters.
    text = re.sub(r'[^a-zA-Z\u0B80-\u0BF2\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove custom stopwords
    tokens = [word for word in tokens if word not in stopwords]

    # Remove short words (length counts code points, including Tamil marks)
    tokens = [word for word in tokens if len(word) > 2]

    # Lemmatization, using a POS guess for each token
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]

    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]

    # Remove duplicate words while preserving first-seen order
    tokens = list(dict.fromkeys(tokens))

    return ' '.join(tokens)

# Input CSV locations (train / validation / unlabeled test split)
train_path = '/content/drive/MyDrive/Tam-SA-train.csv'
val_path = '/content/drive/MyDrive/Tam-SA-val.csv'
test_path = '/content/drive/MyDrive/Tam-SA-test-without-labels.csv'

# Read all three splits up front, in train -> val -> test order
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Overwrite each frame's 'Text' column with its preprocessed form
for frame in (train_df, val_df, test_df):
    frame['Text'] = frame['Text'].apply(preprocess_text, args=(custom_stopwords,))

# Write the cleaned splits back out, without the index column
for frame, destination in (
    (train_df, '/content/drive/MyDrive/traincl3.csv'),
    (val_df, '/content/drive/MyDrive/valcl3.csv'),
    (test_df, '/content/drive/MyDrive/testcl3.csv'),
):
    frame.to_csv(destination, index=False)

print("Preprocessing complete. Cleaned datasets saved.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Preprocessing complete. Cleaned datasets saved.
