<a href="https://colab.research.google.com/github/Tanushreejaganathan/Sentiment_Analysis_tamil_tulu/blob/main/Tulu_Preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Derive a corpus-specific stopword list from word frequencies
import pandas as pd
import re
from collections import Counter

# Locations of the three dataset splits (expects Google Drive mounted at /content/drive)
train_path = '/content/drive/MyDrive/Tulu_SA_train.csv'
val_path = '/content/drive/MyDrive/Tulu_SA_val.csv'
test_path = '/content/drive/MyDrive/Tulu_SA_test_without_label.csv'

# Read the splits in train / val / test order
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Stack the 'Text' column of every split into one Series for corpus-wide statistics
combined_text = pd.concat(
    [split_df['Text'] for split_df in (train_df, val_df, test_df)],
    ignore_index=True,
)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lowercase ``text``, delete disallowed characters, and split on whitespace.

    Only ASCII letters, code points in the Kannada Unicode block
    (U+0C80-U+0CFF) and whitespace are kept. Everything else (digits,
    punctuation, emoji, ...) is deleted without inserting a space, so
    "a,b" becomes the single token "ab".

    Returns:
        A list of tokens; an empty list when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    lowered = text.lower()
    kept = re.sub(r'[^a-zA-Z\u0C80-\u0CFF\s]', '', lowered)
    return kept.split()

# Flatten every non-null comment into a single token stream
tokens = [
    token
    for comment in combined_text.dropna()
    for token in clean_and_tokenize(comment)
]

# Tally how often each token occurs across the combined text
word_freq = Counter(tokens)

# Treat the 100 most frequent tokens as stopwords (adjust the cutoff as needed).
# NOTE(review): a pure frequency cutoff can also capture sentiment-bearing words,
# and the counts include the test split -- confirm both are intended.
stopwords = [token for token, _count in word_freq.most_common(100)]

# Show the derived stopword list
print("Identified Stopwords in Tulu Language:")
print(stopwords)

# Optionally persist the list, one word per line, in the working directory
with open('tulu_stopwords.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(f"{word}\n" for word in stopwords)


Identified Stopwords in Tulu Language:
['super', 'comedy', 'pukuli', 'the', 'song', 'tulu', 'undu', 'is', 'i', 'bro', 'and', 'sir', 'da', 'movie', 'anna', 'best', 'in', 'of', 'full', 'team', 'you', 'all', 'this', 'video', 'good', 'nice', 'love', 'na', 'd', 'like', 'to', 'masth', 'marre', 'koointulu', 'a', 'please', 'acting', 'malpule', 'very', 'u', 'santhu', 'maya', 'for', 'yedde', 'edde', 'it', 'onji', 'mare', 'upload', 'kudla', 'from', 'tulubaravu', 'jaitulunad', 'ಸೂಪರ್', 'superb', 'namma', 'tuluscript', 'ತುಲು', 'tulunad', 'padle', 'my', 'la', 'tululipi', 'jai', 'voice', 'ತುಳು', 'dada', 'plz', 'e', 'bari', 'mast', 'baari', 'n', 'g', 'ye', 'rai', 'next', 'ajji', 'nataka', 'film', 'tulubarahu', 'porlu', 'wow', 'are', 'one', 'your', 'bale', 'so', 'arpith', 'act', 'but', 'pls', 'korle', 'we', 'yan', 'santu', 'erna', 'up', 'spr', 'show']


Preprocessing technique for the Tulu language

In [None]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this cell relies on (tokenizer, WordNet, POS tagger).
# Order matches the original sequence of download calls.
for resource in (
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
):
    nltk.download(resource)

# Custom stopwords derived from the dataset (the 100 most frequent tokens).
# Kept as a set so membership tests during preprocessing are O(1).
# NOTE(review): the list contains words such as 'super', 'good', 'nice', 'love',
# 'best' and 'wow'; if this feeds a sentiment model, confirm dropping them is intended.
tulu_stopwords = set("""
    super comedy pukuli the song tulu undu is i bro
    and sir da movie anna best in of full team
    you all this video good nice love na d like
    to masth marre koointulu a please acting malpule very u
    santhu maya for yedde edde it onji mare upload kudla
    from tulubaravu jaitulunad ಸೂಪರ್ superb namma tuluscript ತುಲು tulunad padle
    my la tululipi jai voice ತುಳು dada plz e bari
    mast baari n g ye rai next ajji nataka film
    tulubarahu porlu wow are one your bale so arpith act
    but pls korle we yan santu erna up spr show
""".split())

# Function to get POS tag for lemmatization
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map a word's POS tag to the constant ``WordNetLemmatizer.lemmatize`` accepts.

    The word is tagged on its own (a one-element token list), so the tag does
    not depend on neighbouring words and the result is a pure function of
    ``word``. Results are therefore memoised: ``nltk.pos_tag`` is costly per
    call, and the same words recur across many rows of the dataset.

    Args:
        word: A single token (must be hashable, e.g. ``str``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        according to the first letter of the tag; ``wordnet.NOUN`` for any
        other tag.
    """
    # pos_tag returns [(token, tag)]; only the first letter of the tag is used.
    treebank_tag = nltk.pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(initial, wordnet.NOUN)

# Preprocessing function
def preprocess_text(text, stopwords):
    """Clean, tokenize, filter, lemmatize and stem one piece of text.

    Steps, in order: lowercase; strip URLs and HTML tags; delete every
    character that is not an ASCII letter, a Kannada-block code point
    (U+0C80-U+0CFF) or whitespace; tokenize; drop tokens found in
    ``stopwords``; drop tokens of length <= 2; lemmatize (WordNet) and stem
    (Porter) each token; remove repeated tokens, keeping first occurrences.

    Args:
        text: The raw text. Non-string values (e.g. NaN cells) are returned
            unchanged.
        stopwords: Container of lowercase tokens to discard (a set is
            recommended for fast membership tests).

    Returns:
        The surviving tokens joined by single spaces, or ``text`` itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Lowercase conversion
    text = text.lower()

    # Remove URLs. The patterns are anchored on "://" and on a word-initial
    # "www." so that ordinary elongated words ("awwww", "wowwww") are no
    # longer mistaken for links and truncated.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and numbers (Kannada Unicode range is kept)
    text = re.sub(r'[^a-zA-Z\u0C80-\u0CFF\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove custom stopwords and short words (length <= 2)
    tokens = [word for word in tokens if word not in stopwords]
    tokens = [word for word in tokens if len(word) > 2]

    # Lemmatization, using a POS tag looked up per token
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]

    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]

    # Remove duplicate words; dict.fromkeys keeps first-occurrence order
    tokens = list(dict.fromkeys(tokens))

    return ' '.join(tokens)

# Source CSVs for the three splits (expects Google Drive mounted at /content/drive)
train_path = '/content/drive/MyDrive/Tulu_SA_train.csv'
val_path = '/content/drive/MyDrive/Tulu_SA_val.csv'
test_path = '/content/drive/MyDrive/Tulu_SA_test_without_label.csv'

train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Replace each split's 'Text' column with its preprocessed form
for split_df in (train_df, val_df, test_df):
    split_df['Text'] = split_df['Text'].apply(preprocess_text, args=(tulu_stopwords,))

# Save processed datasets (only after every split has been preprocessed)
for split_df, out_path in (
    (train_df, '/content/drive/MyDrive/Tulutraincl1.csv'),
    (val_df, '/content/drive/MyDrive/Tuluvalcl1.csv'),
    (test_df, '/content/drive/MyDrive/Tulutestcl1.csv'),
):
    split_df.to_csv(out_path, index=False)

print("Preprocessing complete. Cleaned datasets saved.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing complete. Cleaned datasets saved.
