In [11]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Download NLTK resources
nltk.download(['stopwords', 'wordnet', 'punkt'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:


# 1. Load and clean data
df = pd.read_csv('Scraped_Car_Review_ferrari.csv', engine='python')
df = df.dropna(subset=['Review', 'Rating']).reset_index(drop=True)

# 2. Enhanced label balancing
def balanced_label(rating):
    if rating >= 4.25: return 'Positive'
    elif rating <= 3.0: return 'Negative'
    else: return 'Neutral'  # Wider neutral range

df['Sentiment'] = df['Rating'].apply(balanced_label)

# 3. Advanced text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    tokens = nltk.word_tokenize(text)
    return ' '.join([
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word.isalpha() 
        and word not in stop_words
        and len(word) > 2
    ])

df['Clean_Review'] = df['Review'].apply(preprocess)

# 4. TF-IDF Vectorization with n-grams
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X = tfidf.fit_transform(df['Clean_Review'])
y = df['Sentiment']

# 5. Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6. Use LinearSVC (better for small datasets)
model = LinearSVC(class_weight='balanced')
model.fit(X_train, y_train)

# 7. Evaluation
print("Accuracy:", model.score(X_test, y_test))
print(classification_report(y_test, model.predict(X_test)))
# 8. Test prediction
test_review = "The transmission failed after 2000 miles. Worst car ever!"
clean_test = preprocess(test_review)
print("Prediction:", model.predict(tfidf.transform([clean_test]))[0])


Accuracy: 0.8888888888888888
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         1
     Neutral       0.00      0.00      0.00         2
    Positive       0.89      1.00      0.94        24

    accuracy                           0.89        27
   macro avg       0.30      0.33      0.31        27
weighted avg       0.79      0.89      0.84        27

Prediction: Positive


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Download NLTK resources (no wordnet needed)
nltk.download('stopwords')
nltk.download('punkt')

# Load and clean data
df = pd.read_csv('Scraped_Car_Review_ford.csv', engine='python')
df = df.dropna(subset=['Review', 'Rating']).reset_index(drop=True)

# Enhanced label balancing
def balanced_label(rating):
    if rating >= 4.25: return 'Positive'
    elif rating <= 3.0: return 'Negative'
    else: return 'Neutral'

df['Sentiment'] = df['Rating'].apply(balanced_label)

# Text preprocessing with stemming
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    tokens = nltk.word_tokenize(text)
    return ' '.join([
        stemmer.stem(word)
        for word in tokens
        if word.isalpha()
        and word not in stop_words
        and len(word) > 2
    ])

df['Clean_Review'] = df['Review'].apply(preprocess)

# TF-IDF Vectorization with n-grams
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X = tfidf.fit_transform(df['Clean_Review'])
y = df['Sentiment']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Use LinearSVC (better for small datasets)
model = LinearSVC(class_weight='balanced')
model.fit(X_train, y_train)

# Evaluation
accuracy = model.score(X_test, y_test)
report = classification_report(y_test, model.predict(X_test), zero_division=0)

# Test prediction
test_review = "The transmission failed after 2000 miles. Worst car ever!"
#test_review = "Car gained 200 miles per hour speed in 50 seconds! what a great feeling"
clean_test = preprocess(test_review)
prediction = model.predict(tfidf.transform([clean_test]))

print("Accuracy:", accuracy)
print(report)
print(test_review)
print("Prediction for test review:", prediction[0])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.7092725409836066
              precision    recall  f1-score   support

    Negative       0.58      0.73      0.65       620
     Neutral       0.41      0.30      0.35       854
    Positive       0.83      0.85      0.84      2430

    accuracy                           0.71      3904
   macro avg       0.60      0.63      0.61      3904
weighted avg       0.70      0.71      0.70      3904

The transmission failed after 2000 miles. Worst car ever!
Prediction for test review: Negative


In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load the IMDB dataset
# You can download this dataset from: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
df = pd.read_csv('IMDB Dataset.csv')
print(f"Dataset shape: {df.shape}")
print(df.head())

# Text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    # Remove non-alphanumeric characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and apply stemming
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

# Apply preprocessing to the review column
print("Preprocessing text data...")
df['processed_review'] = df['review'].apply(preprocess_text)

# Convert sentiment to binary (1 for positive, 0 for negative)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the data into training and testing sets
X = df['processed_review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
print("Extracting features...")
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Define models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(max_iter=10000),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Train and evaluate models
results = {}
print("\nTraining and evaluating models...")

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    # Print detailed classification report
    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Compare models
print("\nModel Comparison:")
comparison_df = pd.DataFrame(results).T
print(comparison_df)

# Function to predict sentiment for new reviews
def predict_sentiment(review_text, model_name='SVM'):
    # Preprocess the new review
    processed_review = preprocess_text(review_text)
    # Transform to TF-IDF features
    review_tfidf = tfidf.transform([processed_review])
    # Predict sentiment
    prediction = models[model_name].predict(review_tfidf)[0]
    sentiment = 'Positive' if prediction == 1 else 'Negative'
    return sentiment

# Example usage
print("\nPredicting sentiment for new reviews:")
test_reviews = [
    "This movie was fantastic! The acting was superb and the plot was engaging.",
    "Terrible movie. Bad acting, worse script. Complete waste of time.",
    "The movie had some good moments, but overall it was disappointing."
]

for review in test_reviews:
    nb_sentiment = predict_sentiment(review, 'Naive Bayes')
    svm_sentiment = predict_sentiment(review, 'SVM')
    lr_sentiment = predict_sentiment(review, 'Logistic Regression')
    
    print(f"\nReview: {review}")
    print(f"Naive Bayes: {nb_sentiment}")
    print(f"SVM: {svm_sentiment}")
    print(f"Logistic Regression: {lr_sentiment}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset shape: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Preprocessing text data...
Extracting features...

Training and evaluating models...

Training Naive Bayes...

Naive Bayes Performance:
Accuracy: 0.8590
Precision: 0.8485
Recall: 0.8768
F1 Score: 0.8624

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      4961
           1       0.85      0.88      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


Training SVM...

SVM Performance:
Accurac