In [2]:
pip install textblob


Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Collecting nltk>=3.8
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2024.7.24-cp38-cp38-win_amd64.whl (269 kB)
Installing collected packages: regex, nltk, textblob
  Attempting uninstall: regex
    Found existing installation: regex 2020.10.15
    Uninstalling regex-2020.10.15:
      Successfully uninstalled regex-2020.10.15
  Attempting uninstall: nltk
    Found existing installation: nltk 3.5
    Uninstalling nltk-3.5:
      Successfully uninstalled nltk-3.5
Successfully installed nltk-3.9.1 regex-2024.7.24 textblob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Load the dataset
file_path = 'amazon_review.csv'  # Using file path Locally
data = pd.read_csv(file_path)

# Ensure that the 'reviewText' column is in string format
data['reviewText'] = data['reviewText'].astype(str)

# Feature 1: Count of Exclamation Marks
data['exclamation_marks'] = data['reviewText'].apply(lambda x: x.count('!'))

# Feature 2: Count of Question Marks
data['question_marks'] = data['reviewText'].apply(lambda x: x.count('?'))

# Feature 3: Count of Capitalized Words
def count_capitalized_words(text):
    return sum(1 for word in text.split() if word.isupper() and len(word) > 1)

data['capitalized_words'] = data['reviewText'].apply(count_capitalized_words)

# Feature 4: Sentiment Polarity (using TextBlob)
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

data['sentiment_polarity'] = data['reviewText'].apply(get_sentiment)

# Feature 5: Sentiment Subjectivity (using TextBlob)
def get_subjectivity(text):
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

data['sentiment_subjectivity'] = data['reviewText'].apply(get_subjectivity)

# Feature 6: Count of Punctuation (total punctuation marks)
def count_punctuation(text):
    return len([char for char in text if char in string.punctuation])

data['punctuation_count'] = data['reviewText'].apply(count_punctuation)

# Display the new features
print(data[['exclamation_marks', 'question_marks', 'capitalized_words', 'sentiment_polarity', 'sentiment_subjectivity', 'punctuation_count']].head())

# TF-IDF Feature Extraction (Combining with New Features)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(data['reviewText'])

# Convert the TF-IDF matrix into a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names())  # Using get_feature_names() for older Scikit-learn versions

# Combine the new text features with the TF-IDF matrix
combined_features = pd.concat([tfidf_df, data[['exclamation_marks', 'question_marks', 'capitalized_words', 'sentiment_polarity', 'sentiment_subjectivity', 'punctuation_count']].reset_index(drop=True)], axis=1)

# Show combined features
print(combined_features.head())

# Now, you can use 'combined_features' as input for model training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming 'overall' is the target variable
X = combined_features
y = data['overall']

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


   exclamation_marks  question_marks  capitalized_words  sentiment_polarity  \
0                  0               0                  0            0.000000   
1                  0               0                  0            0.200000   
2                  0               0                  0            0.129167   
3                  2               0                  3            0.000000   
4                  0               0                  0            0.386667   

   sentiment_subjectivity  punctuation_count  
0                   0.000                  1  
1                   0.200                  4  
2                   0.525                  3  
3                   0.550                 15  
4                   0.360                  8  
    00  000   01  017   03  032g   04   06  064g   08  ...  zte  ztpad  zumo  \
0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...  0.0    0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...  0.0    0.0   0.0   
2  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
