### Step 1: Import Required Libraries


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

### Step 2: Load the Dataset

In [15]:
# Load the dataset
df = pd.read_csv('amazon_reviews.csv')
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [16]:
df.shape

(3150, 5)

### Step 3: Data Preprocessing

TfidfVectorizer (Term Frequency-Inverse Document Frequency) is used to convert text data into numerical features so that machine learning models can process it.

stop_words='english' → Removes common words like "the", "is", "and".  
max_features=5000 → Keeps only the top 5000 most important words.

In [17]:
# Keep only necessary columns and drop missing values
df = df[['verified_reviews', 'feedback']].dropna()

# Convert text reviews into numerical features using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X = vectorizer.fit_transform(df['verified_reviews'])

# Target variable
y = df['feedback']

### Step 4: Split the Data into Training & Testing Sets

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

### Step 5: Train the Decision Tree Classifier

In [19]:
dtc = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc.fit(X_train, y_train)

### Step 6: Cross Validation on Training data

Cross Validation reduces variation in model performance by averaging results across multiple splits.

In [20]:
cv_scores = cross_val_score(dtc, X_train, y_train, cv=10)
print(f"Cross-validation Accuracy: {cv_scores.mean():.2f}")

Cross-validation Accuracy: 0.93


### Step 6: Make Predictions

In [21]:
y_pred = dtc.predict(X_test)

### Step 7: Checking Bias & Variance

In [22]:
# Train Accuracy (Bias Check)
train_accuracy = dtc.score(X_train, y_train)

# Test Accuracy (Variance Check)
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Train Accuracy: 0.9606986899563319
Test Accuracy: 0.9142857142857143


### Step 7: Evaluate the Model

In [23]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.35      0.45        63
           1       0.93      0.98      0.95       567

    accuracy                           0.91       630
   macro avg       0.78      0.66      0.70       630
weighted avg       0.90      0.91      0.90       630



In [24]:
# Test
new_reviews = ["This is the Bad product"]
new_reviews_tfidf = vectorizer.transform(new_reviews)

predictions = dtc.predict(new_reviews_tfidf)
print(predictions)


[1]


In [25]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(dtc, X_train, y_train, cv=skf, scoring='accuracy')
print("Cross-validation Accuracy:", scores.mean())

Cross-validation Accuracy: 0.930523998863959


In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
