In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import warnings
warnings.filterwarnings("ignore")
import textwrap


In [None]:
!pip install tiktoken accelerate transformers_stream_generator einops optimum  auto-gptq




In [None]:
# Note: The default behavior now has injection attack prevention off.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-1_8B-Chat-Int4", #"Qwen/Qwen-7B-Chat-Int4",
    device_map="auto",
    trust_remote_code=True
).eval()




Solving class example using Qwen LLM

In [None]:
prompt = f"""
Classify the last sentence using the following examples.

<sentence>: it is nice weather
<sentiment>: positive
<sentence>: it is cold
<sentiment>: negative
<sentence>: it is not very cold
<sentiment>: positive
<sentence>: not nice
<sentiment>: negative
<sentence>: very cold
<sentiment>: negative
<sentence>: cold
<sentiment>: negative
<sentence>: very nice weather
<sentiment>: positive
<sentence>: it is not nice cold at all

"""
response, history = model.chat(tokenizer, prompt, history=None)
print(textwrap.fill(response, width=80))

The last sentence can be classified as having a sentiment of negative, but with
some positivity in the description. It uses words like "not nice" and "cold,"
which indicate that there are some negative aspects to the weather. However, the
overall tone of the sentence is still negative due to the repeated use of the
word "not nice."


In [None]:
from transformers import pipeline
import pandas as pd

In [None]:
sentences = ["it is nice weather ", "it is cold ", "it is not very cold ", "not nice ", "very cold ", "cold ", "very nice weather ", "it is not nice cold at all"]

In [None]:
text = "it is not nice cold at all"

Using PLM (bert-finetune-sst2)

In [None]:
classifier = pipeline("text-classification")
outputs = classifier(text)
pd.DataFrame(outputs)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Unnamed: 0,label,score
0,NEGATIVE,0.999766


In [None]:
for s in sentences:
    outputs = classifier(s)
    print(s, outputs[0]['label'], outputs[0]['score'])


it is nice weather  POSITIVE 0.9998561143875122
it is cold  NEGATIVE 0.9996206760406494
it is not very cold  POSITIVE 0.9659685492515564
not nice  NEGATIVE 0.9997665286064148
very cold  NEGATIVE 0.999714195728302
cold  NEGATIVE 0.9997304081916809
very nice weather  POSITIVE 0.9998589754104614
it is not nice cold at all NEGATIVE 0.9997656941413879


Using Qwen to evaluate all the 8 comments

In [None]:
for i, s in enumerate(sentences):
    text = s
    print(i, ":", text)
    prompt = f"""
    What is the sentiment of the following text, which is delimited with triple backticks?

    Give your answer as either "positive" or "negative". If possible, also share the probability of your answer.
    Review text: '''{text}'''
    """
    response, history = model.chat(tokenizer, prompt, history=None)
    print(textwrap.fill(response, width=80))
    print("----------------------------------")

0 : it is nice weather 
The sentiment of the text "it is nice weather" can be considered positive
because the use of words such as "nice" indicates a favorable opinion of the
weather conditions.  Probability: It's difficult to provide an exact probability
without additional context and information about the reader's beliefs and
opinions on weather. However, based on common sentiment analysis techniques, a
positive sentiment score for this text would likely be around 0.75-0.85. This
suggests that a majority of people would consider the weather to be pleasant.
----------------------------------
1 : it is cold 
The sentiment of the text "it is cold" can be interpreted as negative because it
suggests that the weather is unpleasant and uncomfortable. The probability of
this sentiment being positive would be low due to the use of negative words such
as "cold" and "unpleasant."
----------------------------------
2 : it is not very cold 
The sentiment of the text can be determined by analyzing

Loading IMDB dataset

In [None]:
import pandas as pd
import numpy as np

# load the dataset
'''with open('train.csv','r',encoding="utf8") as f:
    document = f.readlines()
f.close()

labels, texts = [], []
for line in document:
    content = line.split()
    label = content[0]
    labels.append(label[-1])
    texts.append(" ".join(content[1:]))
    print(len(labels), len(texts))
'''
df=pd.read_csv("train.csv")
print(df)


                                                  review sentiment
0      SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...  negative
1      If you want mindless action, hot chicks and a ...  positive
2      "The Woman in Black" is easily one of the cree...  positive
3      I can barely find the words to describe how mu...  negative
4      What's in here ?! Let me tell you. It's the pr...  negative
...                                                  ...       ...
29995  I was really looking forward to this show give...  negative
29996  I searched for this movie for years, apparentl...  positive
29997  This is a story of the Winchester Rifle Model ...  positive
29998  this film is in the MANDINGO & DRUM type<br />...  negative
29999  Ha ha. - oh no - what to say about this film? ...  negative

[30000 rows x 2 columns]


In [None]:
for i in range (0,5):
  print(df.iloc[i]['review'],'\n')

SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<br /><br />Sound format: Mono<br /><br />(Black and white)<br /><br />Suffering from 'hornophobia', Ollie embarks on a 'restful' boat trip, but he and Stan get mixed up with an escaped convict (Rychard Cramer). Chaos ensues.<br /><br />This feature length comedy - an OK entry which nonetheless unspools like a mere imitation of Laurel and Hardy's best work - marked the final collaboration between L&H and producer Hal Roach. Episodic in structure, the movie culminates in a memorable ocean voyage after The Boys are taken hostage by villainous Cramer (who shoots a seagull to prove how tough he is!). The gags are OK, but inspiration is lacking, perhaps due to the recruitment of actor-turned-director Gordon Douglas, previously responsible for Ollie's first solo effort in the sound era (ZENOBIA, produced in 1939), but whose work here lacks a measure of pzazz. Fair, but nothing special. L&H regulars Charlie Hall and James Finlayson make guest appear

In [None]:
df.iloc[0:10]['sentiment']

0    negative
1    positive
2    positive
3    negative
4    negative
5    negative
6    positive
7    negative
8    negative
9    negative
Name: sentiment, dtype: object

Preparing BoW using CountVectorizer and TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
cnt_vectorizer = CountVectorizer()
features = cnt_vectorizer.fit_transform(df['review'])
features_nd = features.toarray()
features_nd.shape



(30000, 82737)

In [None]:
print(features_nd)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test  = train_test_split(features_nd, df['sentiment'], train_size=0.75,random_state=1234)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 2),max_features=100)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 2),max_features=200)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 2),max_features=500)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 2),max_features=2000)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 5),max_features=2000)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
cnt_vectorizer = CountVectorizer(ngram_range=(1, 3),max_features=5000)
features = cnt_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
tfidf_vectorizer = TfidfVectorizer()#(ngram_range=(1, 3),max_features=5000)
features = tfidf_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),max_features=5000)
features = tfidf_vectorizer.fit_transform(texts)
features_nd = features.toarray()
print(features_nd.shape)
X_train, X_test, y_train, y_test  = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
clf1 = RandomForestClassifier()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
clf1 = KNeighborsClassifier()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
X_train2, X_test2, y_train, y_test  = train_test_split(texts, labels, train_size=0.75,random_state=1234)

Evaluating all 2500 test records using PLM (bert-finetuned-sst)

This may take upto 2-3 minutes on your local machine

In [None]:
bert_pred = []
for s in X_test2:
    outputs = classifier(s)
    bert_pred.append(outputs[0]['label'])

In [None]:
#map 'Positive' to 2 and 'Negative' to 1 in bert_pred list
bert_pred2 = []
for s in bert_pred:
    if s == 'POSITIVE':
        bert_pred2.append('2')
    else:
        bert_pred2.append('1')
accuracy_score(y_test, bert_pred2)

In [None]:
# take top 100 records from X_test2
X_test2a = X_test[0:100]

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np

In [5]:

import pandas as pd
df = pd.read_csv("train.csv")
df.head(10)


Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative
5,"This is the story of a maniac cop who, for som...",negative
6,Before I continue forth with the new millenniu...,positive
7,"When Rodney Dangerfield is on a roll, he's hil...",negative
8,Prom Night is shot with the artistic eye someo...,negative
9,"""Destroy All Planets"" winds up settling for 'd...",negative


In [6]:
for i in range (0,5):
  print({i},df.iloc[i]['review'],'\n')

{0} SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<br /><br />Sound format: Mono<br /><br />(Black and white)<br /><br />Suffering from 'hornophobia', Ollie embarks on a 'restful' boat trip, but he and Stan get mixed up with an escaped convict (Rychard Cramer). Chaos ensues.<br /><br />This feature length comedy - an OK entry which nonetheless unspools like a mere imitation of Laurel and Hardy's best work - marked the final collaboration between L&H and producer Hal Roach. Episodic in structure, the movie culminates in a memorable ocean voyage after The Boys are taken hostage by villainous Cramer (who shoots a seagull to prove how tough he is!). The gags are OK, but inspiration is lacking, perhaps due to the recruitment of actor-turned-director Gordon Douglas, previously responsible for Ollie's first solo effort in the sound era (ZENOBIA, produced in 1939), but whose work here lacks a measure of pzazz. Fair, but nothing special. L&H regulars Charlie Hall and James Finlayson make guest ap

In [7]:
df.iloc[0:10]['sentiment']

0    negative
1    positive
2    positive
3    negative
4    negative
5    negative
6    positive
7    negative
8    negative
9    negative
Name: sentiment, dtype: object

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [9]:
def train_model(model, param_grid, vectorizer, ngram_range, max_features):
    cnt_vectorizer = vectorizer(ngram_range=ngram_range, max_features=max_features)
    features = cnt_vectorizer.fit_transform(df['review'])
    features_nd = features.toarray()

    X_train, X_test, y_train, y_test = train_test_split(features_nd, df['sentiment'], train_size=0.75, random_state=1234)

    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)
    print("Model Name:", model.__class__.__name__, "\n", "Ngram Range:", ngram_range, "\n", "Max Features:", max_features, "\n", "Vectorizer:", vectorizer.__name__, "\n", "Accuracy:", accuracy_score(y_test, y_pred))
    print("Best parameters:", grid_search.best_params_)


In [10]:
RF=RandomForestClassifier()
DT=DecisionTreeClassifier()



In [11]:
param_DT = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
param_RF = {'n_estimators': [50, 100, 150, 200], 'max_depth': [3, 5, 7], 'min_samples_split': [3, 5, 7], 'min_samples_leaf': [16, 32]}

In [None]:
models_params = [
    (DT, param_DT),
    (RF, param_RF),

]

ngram_range = [(1, 2), (1, 3)]
max_features = [500, 1000, 2000, 3000, 4000, 5000]
vectorizers = [CountVectorizer, TfidfVectorizer]


for model, model_params in models_params:
    for vec in vectorizers:
        for ngram in ngram_range:
            for max_feat in max_features:
                train_model(model, model_params, vec, ngram, max_feat)

Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 500 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7169333333333333
Best parameters: {'max_depth': 10}
Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 1000 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7197333333333333
Best parameters: {'max_depth': 10}
Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 2000 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7272
Best parameters: {'max_depth': 10}
Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 3000 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7265333333333334
Best parameters: {'max_depth': 10}
Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 4000 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7257333333333333
Best parameters: {'max_depth': 10}
Model Name: DecisionTreeClassifier 
 Ngram Range: (1, 2) 
 Max Features: 5000 
 Vectorizer: CountVectorizer 
 Accuracy: 0.7270666666666666
Best

Qwen and Zephyr can't be done on local machines (specially laptops) and should be done on Cloud