In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/product-sentiment-classification/Participants_Data/Sample Submission.csv
/kaggle/input/product-sentiment-classification/Participants_Data/Train.csv
/kaggle/input/product-sentiment-classification/Participants_Data/Test.csv


## Importing libraries


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

### Load the data

In [3]:

train_data = pd.read_csv("/kaggle/input/product-sentiment-classification/Participants_Data/Train.csv")
test_data = pd.read_csv("/kaggle/input/product-sentiment-classification/Participants_Data/Test.csv")

print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)

Training Data Shape: (6364, 4)
Testing Data Shape: (2728, 3)


### Inspect the data

In [4]:

train_data.head()

Unnamed: 0,Text_ID,Product_Description,Product_Type,Sentiment
0,3057,The Web DesignerÛªs Guide to iOS (and Android...,9,2
1,6254,RT @mention Line for iPad 2 is longer today th...,9,2
2,8212,Crazy that Apple is opening a temporary store ...,9,2
3,4422,The lesson from Google One Pass: In this digit...,9,2
4,5526,RT @mention At the panel: &quot;Your mom has a...,9,2


### Check the distribution of sentiment values

In [5]:

print("Sentiment value counts:\n", train_data['Sentiment'].value_counts())


Sentiment value counts:
 Sentiment
2    3765
3    2089
1     399
0     111
Name: count, dtype: int64


### Remove invalid sentiment rows

In [6]:
# Remove invalid sentiment rows (if necessary, e.g., if '0' is not a valid sentiment)
train_data = train_data[train_data['Sentiment'] != '0']


### Encode the sentiment labels (target)

In [7]:

label_encoder = LabelEncoder()
train_data['Sentiment'] = label_encoder.fit_transform(train_data['Sentiment'])


### Define the features (Product_Description) and target (Sentiment)


In [8]:
X = train_data['Product_Description']
y = train_data['Sentiment']

### TF-IDF Vectorization

In [9]:

vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features
X_tfidf = vectorizer.fit_transform(X)

### Split the data into training and testing sets

In [10]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


### Random Forest Classifier


In [11]:
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_tfidf, y_train)

### Predict and evaluate using Random Forest


In [12]:
rfc_predictions = rfc.predict(X_test_tfidf)

### Metrics for Random Forest


In [13]:

rfc_f1 = f1_score(y_test, rfc_predictions, average='macro')
rfc_precision = precision_score(y_test, rfc_predictions, average='macro')
rfc_recall = recall_score(y_test, rfc_predictions, average='macro')
rfc_accuracy = accuracy_score(y_test, rfc_predictions)
print("Random Forest Metrics:")
print(f"F1 Score (Macro): {rfc_f1}")
print(f"Precision (Macro): {rfc_precision}")
print(f"Recall (Macro): {rfc_recall}")
print(f"Accuracy: {rfc_accuracy}")


Random Forest Metrics:
F1 Score (Macro): 0.3959842391634943
Precision (Macro): 0.5106653964034826
Recall (Macro): 0.3726040226794744
Accuracy: 0.6739984289080911


### Logistic Regression Classifier

In [14]:
logicreg = LogisticRegression(max_iter=200, random_state=42)  # max_iter increased for convergence
logicreg.fit(X_train_tfidf, y_train)

### Predict and evaluate using Logistic Regression


In [15]:
logicreg_predictions = logicreg.predict(X_test_tfidf)

### Metrics for Logistic Regression


In [16]:
logicreg_f1 = f1_score(y_test, logicreg_predictions, average='macro')
logicreg_precision = precision_score(y_test, logicreg_predictions, average='macro')
logicreg_recall = recall_score(y_test, logicreg_predictions, average='macro')
logicreg_accuracy = accuracy_score(y_test, logicreg_predictions)
print("\nLogistic Regression Metrics:")
print(f"F1 Score (Macro): {logicreg_f1}")
print(f"Precision (Macro): {logicreg_precision}")
print(f"Recall (Macro): {logicreg_recall}")
print(f"Accuracy: {logicreg_accuracy}")


Logistic Regression Metrics:
F1 Score (Macro): 0.3375887220531571
Precision (Macro): 0.43097888205028356
Recall (Macro): 0.3403860695405171
Accuracy: 0.6787117046347211


  _warn_prf(average, modifier, msg_start, len(result))


### Predict on test data

In [17]:

if 'Product_Description' in test_data.columns:
    X_test_final_tfidf = vectorizer.transform(test_data['Product_Description'])
    test_predictions = rfc.predict(X_test_final_tfidf)
    # Convert predictions back to original labels
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)
    
    # Save the predictions to a CSV file
    test_data['Predicted_Sentiment'] = test_predictions_labels
    test_data[['Product_Description', 'Predicted_Sentiment']].to_csv('test_predictions.csv', index=False)
    print("Test predictions saved to 'test_predictions.csv'")

Test predictions saved to 'test_predictions.csv'


In [18]:
test_predictions = pd.read_csv("test_predictions.csv")

In [19]:
test_predictions.head()

Unnamed: 0,Product_Description,Predicted_Sentiment
0,RT @mention Going to #SXSW? The new iPhone gui...,3
1,RT @mention 95% of iPhone and Droid apps have ...,1
2,RT @mention Thank you to @mention for letting ...,2
3,#Thanks @mention we're lovin' the @mention app...,3
4,At #sxsw? @mention / @mention wanna buy you a ...,2


### Observation 
- Best classification model is random forest