In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
data = {
    "text": [
        "It's amazing",
        "This was just so, so bad",
        "Amazing quality and great service",
        "I hate it so much, worst experience ever",
        "I will buy again, it was brilliant and well made",
        "Terrible, so disappointing and a waste of money",
        "Absolutely fantastic, exceeded all expectations",
        "Horrible experience, I would never recommend",
        "The best purchase I have made in years",
        "Awful, nothing like what was advertised",
        "Loved it! Perfect in every way",
        "Superb quality, exactly what I wanted",
        "Highly recommended, wonderful experience",
        "Everything was flawless, five stars",
        "Great product, very happy with my purchase",
        "I regret buying this, completely useless",
        "Worst decision ever, I feel cheated",
        "Total disaster, I wish I could give zero stars",
        "Not worth the money, extremely disappointed",
        "Terrible service, I had the worst time dealing with this"
    ],
    "label": [
        1, 0, 1, 0, 1,# 1 = positive 0 = negative
        0, 1, 0, 1, 0,
        1, 1, 1, 1, 1,# added more because too small
        0, 0, 0, 0, 0
    ]
}

In [3]:
df = pd.DataFrame(data)

In [22]:
xtr, xts, ytr, yts = train_test_split(df.text, df.label, test_size=0.2, random_state=42)

In [5]:
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(xtr)
x_test_tfidf = vectorizer.transform(xts)

In [6]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['absolutely', 'advertised', 'again', 'all', 'amazing', 'and',
       'awful', 'best', 'brilliant', 'buy', 'cheated', 'dealing',
       'decision', 'disappointed', 'disappointing', 'ever', 'every',
       'everything', 'exactly', 'exceeded', 'expectations', 'experience',
       'extremely', 'fantastic', 'feel', 'five', 'flawless', 'great',
       'had', 'happy', 'hate', 'have', 'highly', 'horrible', 'in', 'it',
       'like', 'loved', 'made', 'money', 'much', 'my', 'never', 'not',
       'nothing', 'of', 'perfect', 'product', 'purchase', 'quality',
       'recommend', 'recommended', 'service', 'so', 'stars', 'superb',
       'terrible', 'the', 'this', 'time', 'very', 'wanted', 'was',
       'waste', 'way', 'well', 'what', 'will', 'with', 'wonderful',
       'worst', 'worth', 'would', 'years'], dtype=object)

In [8]:
x_train_tfidf.shape, x_test_tfidf.shape

((16, 74), (4, 74))

In [9]:
x_test_tfidf[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 74)>

In [10]:
pd.DataFrame(x_train_tfidf.toarray(), columns = feature_names)

Unnamed: 0,absolutely,advertised,again,all,amazing,and,awful,best,brilliant,buy,...,way,well,what,will,with,wonderful,worst,worth,would,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.412305,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.412305
1,0.0,0.0,0.0,0.0,0.0,0.321292,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.409768,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.337263,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.431706,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.372954,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.507509,0.39548,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.431706,0.0,0.0,0.0,0.0,0.431706,0.0,0.0,0.0,...,0.0,0.0,0.375962,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.318216,0.0,0.284739,0.0,0.0,0.0


In [12]:
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'logisticregression__penalty': ['l1', 'l2'], # Type of regularization
    'logisticregression__solver': ['liblinear'] # Some solvers work only with specific penalties
}

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
grid = GridSearchCV(make_pipeline(LogisticRegression(n_jobs=-1)), param_grid=param_grid, cv=5)

In [16]:
grid.fit(x_train_tfidf, ytr)



0,1,2
,estimator,Pipeline(step...(n_jobs=-1))])
,param_grid,"{'logisticregression__C': [0.01, 0.1, ...], 'logisticregression__penalty': ['l1', 'l2'], 'logisticregression__solver': ['liblinear']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1000
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [18]:
best_C = grid.best_params_['logisticregression__C']
best_penalty = grid.best_params_['logisticregression__penalty']
best_solver = grid.best_params_['logisticregression__solver']

print("Best C:", best_C)
print("Best penalty:", best_penalty)
print("Best solver:", best_solver)

Best C: 1000
Best penalty: l2
Best solver: liblinear


In [19]:
best_model = grid.best_estimator_

In [21]:
y_pred = best_model.predict(x_test_tfidf)


In [23]:
accuracy_score(y_pred, yts)

0.75

In [24]:
# Get probability estimates
probs = best_model.predict_proba(x_test_tfidf)

# Print results
for text, prob, pred in zip(xts, probs, y_pred):
    print(f"Text: {text}")
    # Probability of being '1' (positive)
    print(f"Predicted Probability: {prob[1]:.4f}")
    print(f"Predicted Sentiment: {'Positive' if pred == 1 else 'Negative'}\n")


Text: It's amazing
Predicted Probability: 0.9609
Predicted Sentiment: Positive

Text: Total disaster, I wish I could give zero stars
Predicted Probability: 0.9623
Predicted Sentiment: Positive

Text: I regret buying this, completely useless
Predicted Probability: 0.2641
Predicted Sentiment: Negative

Text: This was just so, so bad
Predicted Probability: 0.0609
Predicted Sentiment: Negative

