In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset. The file must provide a 'comment' column (raw text) and a
# 'target' column (sentiment label); both are read by name below.
data = pd.read_csv('data.csv')

# Display dataset
print(data.head())

# Keep only rows where both the text and its label are present: a missing
# comment is loaded as float NaN and would break the later string cleaning,
# and a missing label cannot be encoded alongside string labels.
# `data` itself is left untouched so the frame printed above stays accurate.
labelled = data.dropna(subset=['comment', 'target'])
texts = labelled['comment'].values
labels = labelled['target'].values

# Encode the labels as integers; `le.classes_` keeps the mapping back to the
# original label names.
le = LabelEncoder()
labels = le.fit_transform(labels)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)


   index    id       entity    target  \
0      0  2401  Borderlands  Positive   
1      1  2401  Borderlands  Positive   
2      2  2401  Borderlands  Positive   
3      3  2401  Borderlands  Positive   
4      4  2401  Borderlands  Positive   

                                             comment  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [8]:
# Inspect the raw (uncleaned) training texts; the repr elides the middle of the array with `...`.
X_train

array(['Last Weekend league for Fifa 20 Glad I could finish strong . . Birthday Mbappe and TOTS de Bruyne were absolutely clutch .  pic.twitter.com/n1766sVgP9',
       "omg i'm so excited to watch dk play pubg",
       'all others who have problems with', ...,
       'Fuck this call of duty update..',
       'I should get up & feed my dogs & such that way when mac gets outta Dnd we can just do a watch together... I wanna play more borderlands but these cramps are bad today',
       'Welcome to The International!'], dtype=object)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus, then keep the English list as a set so the
# membership checks done during text cleaning are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Clean the text (remove stop words, punctuation, etc.)
def clean_text(text):
    """Normalise one document for TF-IDF vectorisation.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted (digits and punctuation are removed, not replaced
    by a space), and tokens found in the module-level ``stop_words`` set are
    dropped.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(text, float) and text != text
        if text is None or is_nan:
            return ''
        text = str(text)

    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and digits
    # Remove stop words; split() also collapses runs of whitespace
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the same cleaning over both splits
X_train_clean = list(map(clean_text, X_train))
X_test_clean = list(map(clean_text, X_test))

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training split
# only, then reuse that fitted vectorizer to transform the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_test_tfidf = tfidf.transform(X_test_clean)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Inspect the integer-encoded test labels produced by the LabelEncoder.
y_test

array([1, 1, 1, ..., 0, 1, 2])

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the decision tree with a fixed seed and fit it on the TF-IDF training
# matrix (fit returns the estimator itself, so the two steps can be chained).
model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)

# Predict labels for the held-out split
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class breakdown
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      4380
           1       0.80      0.84      0.82      6301
           2       0.80      0.76      0.78      4119

    accuracy                           0.81     14800
   macro avg       0.81      0.80      0.80     14800
weighted avg       0.81      0.81      0.81     14800



In [5]:
import pickle

# Persist the trained model and the fitted TF-IDF vectorizer, one pickle file
# each, written in this order.
for artifact_path, artifact in (
    ('decision_tree_sentiment_model.pkl', model),
    ('tfidf_vectorizer.pkl', tfidf),
):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)
