In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1


In [None]:
import re
import pandas as pd
import stanza
import nltk
from sklearn.utils import resample

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

stanza.download('ta')  # Tamil model
nlp = stanza.Pipeline('ta', processors='tokenize')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package |
-----------------------
| tokenize  | ttb     |
| mwt       | ttb     |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Done loading processors!


In [None]:
file_path = '/content/drive/MyDrive/political_dataset/PS_train.csv'
data = pd.read_csv(file_path)

In [None]:
data.head()

Unnamed: 0,content,labels
0,தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் ...,Neutral
1,அண்ணன் இதனை சூசகமாக 11 மாதங்கள் முன்பே பேட்டிய...,Substantiated
2,ஒரு வருடம் ஆகி விட்டது இந்த துயரம் நேர்ந்து......,Opinionated
3,"எடப்பாடியை கண்டுகொள்ளாத ""எடப்பாடி""🫢\n ---\nஆதர...",Positive
4,எங்களின் அரசியல் அடுத்த தலைமுறைக்குமானது \n#மக...,Opinionated


In [None]:
data.isnull().sum()

Unnamed: 0,0
content,0
labels,0


In [None]:
label_count=data['labels'].value_counts()
label_count

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
Opinionated,1361
Sarcastic,790
Neutral,637
Positive,575
Substantiated,412
Negative,406
None of the above,171


In [None]:
def preprocess_tamil_text(text):
    # Step 1: Normalize Unicode
    text = re.sub(r'\u200c', '', text)  # Remove Zero-Width Non-Joiner (ZWNJ) if present

    # Step 2: Remove non-Tamil characters, special characters, and numbers
    text = re.sub(r'[^஀-௿\s]', '', text)  # Retain only Tamil script and spaces
    text = re.sub(r'\d+', '', text)  # Remove numeric values

    # Step 3: Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 4: Handle commonly used spoken variants (expanded replacements)
    replacements = {
        "ஏ": "எ",  # Normalize vowels
        "ஓ": "ஒ",
        "கௌ": "கோ",  # Normalize common diphthongs
        "சௌ": "சோ",
        "பௌ": "போ",
        "கெ": "கே",
        "செ": "சே",
        "டெ": "டே",
        "தெ": "தே",
        "நெ": "நே",
        "பெ": "பே",
        "மெ": "மே",
        "வெ": "வே",
        "லெ": "லே",
        "றெ": "றே",
        "னெ": "னே",
        "இ": "எ",  # Normalize short vowels
        "உ": "ஒ",
        "க்ஷ": "க",  # Normalize compound consonants
        "ஜ": "ச"
    }
    for key, value in replacements.items():
        text = text.replace(key, value)

    return text

In [None]:
def tokenize_text(text):
    doc = nlp(text)  # Tokenize the text using Stanza
    return ' '.join([word.text for word in doc.sentences[0].words])

In [None]:
data['content'] = data['content'].apply(preprocess_tamil_text)

In [None]:
data['content'] = data['content'].apply(tokenize_text)

In [None]:
data.head()

Unnamed: 0,content,labels
0,தேன்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் ...,Neutral
1,அண்ணன் எதனை சூசகமாக மாதங்கள் முன்பே பேட்டியில்...,Substantiated
2,ஒரு வருடம் ஆகி விட்டது எந்த துயரம் நேர்ந்து என...,Opinionated
3,எடப்பாடியை கண்டுகொள்ளாத எடப்பாடி ஆதரிப்பீர் ஒத...,Positive
4,எங்களின் அரசியல் அடுத்த தலைமுறைக்க்கும் ஆனது ம...,Opinionated


In [None]:
class_counts = data['labels'].value_counts()
max_class_size = class_counts.max()
upsampled_data = []

for label in class_counts.index:
    class_data = data[data['labels'] == label]

    # Upsample to the maximum class size
    upsampled_class_data = resample(class_data,
                                    replace=True,  # Sample with replacement
                                    n_samples=max_class_size,  # Match the size of the largest class
                                    random_state=42)

    upsampled_data.append(upsampled_class_data)

balanced_data = pd.concat(upsampled_data)

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
null_values = balanced_data.isnull().sum()

In [None]:
balanced_data.isnull().sum()

Unnamed: 0,0
content,0
labels,0


In [None]:
balanced_data.head()

Unnamed: 0,content,labels
0,எந்த ரூ அதுவும் தேர்தல் நேரத்தில்பேண்களுக்கு ச...,Opinionated
1,தஞ்சாவூர் நாடாளுமன்றம் தொகுதி வேட்பாளர் திருசம...,Substantiated
2,சாபர் சாதிக் வழக்கில் க்கு ஒள்நோக்கம் எருக்குத...,Negative
3,நல்லா பண்ணி எருக்கிங்க,None of the above
4,தேருக்கோடியில் நின்றாலும் தனியாக தான் நிற்பேன்,Positive


In [None]:
label_counts=balanced_data["labels"].value_counts()
label_counts

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
Opinionated,1361
Substantiated,1361
Negative,1361
None of the above,1361
Positive,1361
Neutral,1361
Sarcastic,1361


In [None]:
test_data=pd.read_csv('/content/drive/MyDrive/political_dataset/PS_test_without_lables.csv')
test_data.head()

Unnamed: 0,Id,content
0,PS_01,இஸ்லாமிய சகோதரர்களுடன் ரமலான் கொண்டாடிய அதிமுக...
1,PS_02,\nஓபிஎஸ் - எடப்பாடி போட்டா போட்டி! திடீரென பணி...
2,PS_03,இன்றைய பரப்புரை:\n\nநாம் தமிழர் கட்சி தலைமை ஒர...
3,PS_04,"🇰🇬🎙️ இன்னும் 05 ஏ நாளில், வெல்வோம் தமிழராய் - ..."
4,PS_05,டாஸ்மாக்ல மட்டும்தான் கருணாநிதி அவர்களின் பெய...


In [None]:
test_data['content'] =test_data['content'].apply(preprocess_tamil_text)

In [None]:
test_data['content'] =test_data['content'].apply(tokenize_text)

In [None]:
test_data.head()

Unnamed: 0,Id,content
0,PS_01,எஸ்லாமிய சகோதர்க் உர்களன் ரமலான் கொண்டாடிய அதி...
1,PS_02,ஒபிஎஸ் எடப்பாடி போட்டா போட்டி திடீரென பணிகளை ம...
2,PS_03,என்றைய பரப்புரை நாம் தமிழர் கட்சி தலைமை ஒருங்க...
3,PS_04,என்னும் எ நாளில் வேல்வோம் தமிழராய் தலை நிமிர்வ...
4,PS_05,டாஸ்மாக்ல மட்டும்தான் கருணாநிதி அவர்களின் பேயர...


In [None]:
balanced_data.to_csv('/content/preprocessed_PS_train.csv', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
data = pd.read_csv('/content/preprocessed_PS_train.csv')
data

Unnamed: 0,content,labels
0,எந்த ரூ அதுவும் தேர்தல் நேரத்தில்பேண்களுக்கு ச...,Opinionated
1,தஞ்சாவூர் நாடாளுமன்றம் தொகுதி வேட்பாளர் திருசம...,Substantiated
2,சாபர் சாதிக் வழக்கில் க்கு ஒள்நோக்கம் எருக்குத...,Negative
3,நல்லா பண்ணி எருக்கிங்க,None of the above
4,தேருக்கோடியில் நின்றாலும் தனியாக தான் நிற்பேன்,Positive
...,...,...
9522,தமிழ்நாடு அரசு கோரிய டிஎம்சி நீரை தர முடியாது ...,Substantiated
9523,நடிகர் விசய் சீமானுக்கு ஆதரவு தேரித்தார் என சொ...,Positive
9524,தம்பி திருப்பூர் சுடலையின் சிறப்பான பேச்சு,Positive
9525,ஆந்திர முதல்வர் மீது கல்வீச்சு முகஸ்டாலின் கண்...,Opinionated


In [None]:
# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

In [None]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = rf_model.predict(X_train_vectorized)
y_pred_test = rf_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

Train Data Evaluation:
Accuracy: 0.9493504789397716
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.93      0.97      0.95      1089
          Neutral       0.93      0.92      0.93      1089
None of the above       0.94      0.99      0.96      1089
      Opinionated       0.99      0.93      0.96      1088
         Positive       0.94      0.95      0.95      1089
        Sarcastic       0.98      0.95      0.96      1089
    Substantiated       0.93      0.95      0.94      1088

         accuracy                           0.95      7621
        macro avg       0.95      0.95      0.95      7621
     weighted avg       0.95      0.95      0.95      7621

Confusion Matrix:
 [[1052    3    8    0    5    0   21]
 [  12 1001   18    5   22    6   25]
 [   0    8 1073    0    0    4    4]
 [  14   30   12 1011   15    2    4]
 [  17   11   12    3 1034    3    9]
 [  20    9   10    3    7 1030   10]
 [  13   13    9    0   1

In [None]:
import pandas as pd

# Load test dataset
test_data = pd.read_csv("/content/preprocessed_PS_test_without_lables.csv")  # Replace with your actual test dataset file path

# Preprocess test data (if needed, skip this if already preprocessed)
# test_data['content'] = test_data['content'].apply(preprocess_tamil_text)

# Vectorize the test dataset
X_test_vectorized = vectorizer.transform(test_data['content'])

# Make predictions
predictions = rf_model.predict(X_test_vectorized)

# Add predictions to the test dataset
test_data['Predicted_Labels'] = predictions

# Save the results to a CSV file
output_file = "test_predictions.csv"
test_data.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to test_predictions.csv


In [None]:
test_pred_data = pd.read_csv("/content/test_predictions.csv")
test_pred_data.head()

Unnamed: 0,Id,content,Predicted_Labels
0,PS_01,எஸ்லாமிய சகோதர்க் உர்களன் ரமலான் கொண்டாடிய அதி...,Sarcastic
1,PS_02,ஒபிஎஸ் எடப்பாடி போட்டா போட்டி திடீரென பணிகளை ம...,Sarcastic
2,PS_03,என்றைய பரப்புரை நாம் தமிழர் கட்சி தலைமை ஒருங்க...,Opinionated
3,PS_04,என்னும் எ நாளில் வேல்வோம் தமிழராய் தலை நிமிர்வ...,Substantiated
4,PS_05,டாஸ்மாக்ல மட்டும்தான் கருணாநிதி அவர்களின் பேயர...,Positive


##Logistic Regression


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Logistic Regression model
logistic_model = LogisticRegression(
    random_state=42,
    max_iter=500,
    class_weight='balanced'
)
logistic_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = logistic_model.predict(X_train_vectorized)
y_pred_test = logistic_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


Train Data Evaluation:
Accuracy: 0.6249835979530245
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.62      0.65      0.63      1089
          Neutral       0.59      0.53      0.56      1089
None of the above       0.77      0.98      0.86      1089
      Opinionated       0.58      0.50      0.53      1088
         Positive       0.57      0.60      0.58      1089
        Sarcastic       0.61      0.51      0.56      1089
    Substantiated       0.59      0.61      0.60      1088

         accuracy                           0.62      7621
        macro avg       0.62      0.62      0.62      7621
     weighted avg       0.62      0.62      0.62      7621

Confusion Matrix:
 [[ 712   67   35   47  101   34   93]
 [  83  574   74   81  126   61   90]
 [   0    0 1064    6    8    3    8]
 [ 102   85   55  541   98  126   81]
 [  80   80   30   91  651   68   89]
 [ 105   72   76   98   83  555  100]
 [  75   95   45   71   7

##Naive Bayes


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = nb_model.predict(X_train_vectorized)
y_pred_test = nb_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


Train Data Evaluation:
Accuracy: 0.5752525915234221
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.52      0.58      0.55      1089
          Neutral       0.59      0.50      0.54      1089
None of the above       0.81      0.88      0.84      1089
      Opinionated       0.57      0.43      0.49      1088
         Positive       0.48      0.60      0.53      1089
        Sarcastic       0.56      0.46      0.51      1089
    Substantiated       0.51      0.57      0.54      1088

         accuracy                           0.58      7621
        macro avg       0.58      0.58      0.57      7621
     weighted avg       0.58      0.58      0.57      7621

Confusion Matrix:
 [[633  56  24  49 155  58 114]
 [102 546  44  52 142  68 135]
 [ 59  10 956  13  23  15  13]
 [125  85  36 470 143 117 112]
 [ 86  74  30  87 653  61  98]
 [123  87  53  91 115 503 117]
 [ 94  71  38  65 125  72 623]]

Test Data Evaluation:
Accuracy: 0.

##Decision tree

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = dt_model.predict(X_train_vectorized)
y_pred_test = dt_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


Train Data Evaluation:
Accuracy: 0.9493504789397716
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.92      0.98      0.95      1089
          Neutral       0.92      0.93      0.93      1089
None of the above       0.94      0.99      0.96      1089
      Opinionated       0.99      0.93      0.96      1088
         Positive       0.95      0.94      0.95      1089
        Sarcastic       0.99      0.94      0.96      1089
    Substantiated       0.95      0.93      0.94      1088

         accuracy                           0.95      7621
        macro avg       0.95      0.95      0.95      7621
     weighted avg       0.95      0.95      0.95      7621

Confusion Matrix:
 [[1062    3    8    0    5    0   11]
 [  16 1016   18    1   17    3   18]
 [   0    8 1077    0    0    0    4]
 [  14   36   12 1014    9    0    3]
 [  17   16   12    7 1026    3    8]
 [  20   12   14    5    7 1023    8]
 [  23   16    9    1   1

##ADA boost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize AdaBoost with hyperparameter tuning
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),  # Slightly deeper tree
    n_estimators=200,  # More estimators
    learning_rate=0.5,  # Lower learning rate
    random_state=42
)

# Fit the model
adaboost_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = adaboost_model.predict(X_train_vectorized)
y_pred_test = adaboost_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


Train Data Evaluation:
Accuracy: 0.288544810392337
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.27      0.21      0.24      1089
          Neutral       0.22      0.11      0.15      1089
None of the above       0.38      0.85      0.53      1089
      Opinionated       0.24      0.04      0.07      1088
         Positive       0.24      0.35      0.28      1089
        Sarcastic       0.22      0.25      0.23      1089
    Substantiated       0.31      0.20      0.24      1088

         accuracy                           0.29      7621
        macro avg       0.27      0.29      0.25      7621
     weighted avg       0.27      0.29      0.25      7621

Confusion Matrix:
 [[233 107 262  33 248 127  79]
 [142 122 240  13 267 212  93]
 [  6   5 921  10  31 102  14]
 [137  89 269  48 243 217  85]
 [ 81 102 169  30 384 197 126]
 [120  52 322  37 203 273  82]
 [130  87 232  28 254 139 218]]

Test Data Evaluation:
Accuracy: 0.2

##Gradiant Boost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier(
    n_estimators=100,  # Number of boosting stages to be used
    learning_rate=0.1,  # How much each tree contributes to the final model
    max_depth=3,  # Max depth of individual trees
    random_state=42
)

# Fit the model
gb_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = gb_model.predict(X_train_vectorized)
y_pred_test = gb_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


Train Data Evaluation:
Accuracy: 0.687836241962997
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.70      0.67      0.68      1089
          Neutral       0.68      0.57      0.62      1089
None of the above       0.69      0.98      0.81      1089
      Opinionated       0.71      0.58      0.64      1088
         Positive       0.63      0.70      0.66      1089
        Sarcastic       0.78      0.60      0.68      1089
    Substantiated       0.66      0.72      0.69      1088

         accuracy                           0.69      7621
        macro avg       0.69      0.69      0.68      7621
     weighted avg       0.69      0.69      0.68      7621

Confusion Matrix:
 [[ 727   62   62   42   88   39   69]
 [  59  626  104   60  103   43   94]
 [   3    0 1065    8    0    0   13]
 [  72   90   91  632   89   44   70]
 [  49   45   68   59  761   33   74]
 [  70   58   90   62   82  652   75]
 [  58   44   60   27   91

##CAT Boost

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load preprocessed dataset
data = pd.read_csv("/content/preprocessed_PS_train.csv")  # Replace with your actual file path

# Split dataset into features (X) and labels (y)
X = data['content']
y = data['labels']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize CatBoost model
catboost_model = CatBoostClassifier(
    iterations=500,  # Number of trees
    learning_rate=0.1,  # Step size
    depth=6,  # Maximum depth of trees
    cat_features=[],  # No categorical features since we're working with text
    random_seed=42,
    verbose=100  # Display training progress
)

# Fit the model
catboost_model.fit(X_train_vectorized, y_train)

# Predictions
y_pred_train = catboost_model.predict(X_train_vectorized)
y_pred_test = catboost_model.predict(X_test_vectorized)

# Evaluation
print("Train Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


0:	learn: 1.9279733	total: 284ms	remaining: 2m 21s
100:	learn: 1.5783618	total: 42.8s	remaining: 2m 49s
200:	learn: 1.4322227	total: 1m 2s	remaining: 1m 32s
300:	learn: 1.3400271	total: 1m 20s	remaining: 52.9s
400:	learn: 1.2576935	total: 1m 37s	remaining: 24.1s
499:	learn: 1.2015192	total: 1m 55s	remaining: 0us
Train Data Evaluation:
Accuracy: 0.7062065345755151
Classification Report:
                    precision    recall  f1-score   support

         Negative       0.74      0.73      0.73      1089
          Neutral       0.67      0.62      0.65      1089
None of the above       0.67      0.98      0.80      1089
      Opinionated       0.72      0.59      0.65      1088
         Positive       0.69      0.71      0.70      1089
        Sarcastic       0.73      0.60      0.66      1089
    Substantiated       0.74      0.72      0.73      1088

         accuracy                           0.71      7621
        macro avg       0.71      0.71      0.70      7621
     weighted avg 