In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix

In [None]:
# Toy corpus: three short sentences held in a single 'Data' column.
df = pd.DataFrame(
    {
        'Data': [
            'ML is subset of AI',
            'AI is superset of Both ML and DL,ML and DL',
            'GenAI is used to create or generate any type of content',
        ]
    }
)

In [None]:
# Display the toy corpus.
df

Unnamed: 0,Data
0,ML is subset of AI
1,"AI is superset of Both ML and DL,ML and DL"
2,GenAI is used to create or generate any type o...


In [None]:
# Second toy corpus; it differs from the first one only in its opening sentence.
df1 = pd.DataFrame(
    {
        'Data': [
            'RL is subset of DL',
            'AI is superset of Both ML and DL,ML and DL',
            'GenAI is used to create or generate any type of content',
        ]
    }
)

In [None]:
# binary=True records presence/absence (0/1) instead of raw counts, the
# encoding typically fed to Bernoulli naive Bayes.
model = CountVectorizer(binary=True)
# The vectorizer expects a 1-D iterable of strings. Iterating a DataFrame
# (df[['Data']]) yields its column labels, so the vectorizer would be fit on
# the single document "Data"; select the column as a Series instead.
res = model.fit_transform(df['Data'])

In [None]:
# min_df=2 drops terms found in fewer than two documents; max_df=3 drops terms
# found in more than three documents.
model = CountVectorizer(binary=True, max_df=3, min_df=2)
# Combine two text features row by row. Join with a space: plain `+` glues the
# last word of one sentence onto the first word of the next ('AI' + 'RL'
# becomes the token 'airl'), which corrupts the document frequencies.
res = model.fit_transform(df['Data'] + ' ' + df1['Data'])

In [None]:
# Dense view of the binary document-term matrix (one row per document).
res.toarray()

array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 0, 1]])

In [None]:
# Terms kept by the fitted vectorizer, in the column order of the matrix.
cols=model.get_feature_names_out()
cols

array(['dl', 'is', 'ml', 'of'], dtype=object)

In [None]:
# Mapping from each kept term to its column index in the matrix.
model.vocabulary_

{'ml': 2, 'is': 1, 'of': 3, 'dl': 0}

In [None]:
# max_features=3 keeps only the three terms with the highest frequency across
# the corpus.
model1 = TfidfVectorizer(binary=True, max_features=3)
# Join the two text columns with a space so that words at the sentence
# boundary stay separate tokens (plain `+` would fuse 'AI' + 'RL' into 'airl').
res1 = model1.fit_transform(df['Data'] + ' ' + df1['Data'])

In [None]:
# Dense view of the TF-IDF matrix (one row per document).
res1.toarray()

array([[0.52284231, 0.67325467, 0.52284231],
       [0.52284231, 0.67325467, 0.52284231],
       [0.70710678, 0.        , 0.70710678]])

In [None]:
# TF-IDF on the first toy corpus only: English stop words are removed, a term
# must occur in at least two documents, and at most three terms are kept.
model1 = TfidfVectorizer(
    binary=True,
    max_features=3,
    min_df=2,
    stop_words='english',
)
res2 = model1.fit_transform(df['Data'])

In [None]:
# Terms that survived the stop-word and min_df filters.
col=model1.get_feature_names_out()
col

array(['ai', 'ml'], dtype=object)

In [None]:
# Dense view of the TF-IDF matrix; a document containing none of the kept
# terms shows up as an all-zero row.
res2.toarray()

array([[0.70710678, 0.70710678],
       [0.70710678, 0.70710678],
       [0.        , 0.        ]])

In [None]:
# Mapping from each kept term to its column index in the matrix.
model1.vocabulary_

{'ml': np.int64(1), 'ai': np.int64(0)}

# STOCKIFY dataset

In [None]:
# Load the sentiment dataset from CSV and preview its first four rows.
# NOTE(review): this rebinds df1, replacing the toy frame defined earlier, and
# the absolute path looks environment-specific — confirm before running elsewhere.
df1=pd.read_csv("/content/stoctify.csv")
df1.head(4)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral


In [None]:
# Summarise column dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [None]:
# Keep the column labels for the per-column summary below.
column=df1.columns
column

Index(['Sentence', 'Sentiment'], dtype='object')

In [None]:
# Drop rows containing missing values before modelling.
df1=df1.dropna()

In [None]:
# Print how many distinct values each column holds.
for name, n_unique in df1[column].nunique().items():
    print(name, "-", n_unique)

Sentence - 5322
Sentiment - 3


In [None]:
# Features stay a one-column DataFrame because the ColumnTransformer below
# selects 'Sentence' by name; the target is the plain 'Sentiment' Series.
x, y = df1[['Sentence']], df1['Sentiment']

In [None]:
# Keep 80% of the rows for training and hold out the rest for evaluation; the
# fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Vectorise the 'Sentence' column with TF-IDF. The column is named with a
# plain string (not a list) so the vectorizer receives a 1-D sequence of
# documents rather than a 2-D frame.
preprocessing = ColumnTransformer(
    transformers=[
        (
            'vectorizer',
            TfidfVectorizer(
                max_features=3000,
                min_df=5,
                max_df=0.9,
                stop_words='english',
            ),
            'Sentence',
        ),
    ],
)

In [None]:
# Chain the TF-IDF preprocessing with a logistic-regression classifier.
pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('model', LogisticRegression()),
])

In [None]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(xtrain, ytrain)


In [None]:
# Mean accuracy on the training split.
pipeline.score(xtrain,ytrain)

0.7986304301305371

In [None]:
# Mean accuracy on the held-out split.
pipeline.score(xtest,ytest)

0.7014542343883661

In [None]:
# Predictions on the TRAINING split (note: the test-split predictions live in
# the similarly named `ypred`).
y_pred=pipeline.predict(xtrain)

In [None]:
# Predictions on the held-out TEST split (note: the training-split predictions
# live in the similarly named `y_pred`).
ypred=pipeline.predict(xtest)

In [None]:
# Confusion matrix for the training split: rows are true labels, columns are
# predicted labels.
confusion_matrix(ytrain,y_pred)

array([[ 219,  387,   79],
       [  66, 2364,   78],
       [  17,  314, 1149]])

In [None]:
# Confusion matrix for the held-out split: rows are true labels, columns are
# predicted labels.
confusion_matrix(ytest,ypred)

array([[ 31,  99,  45],
       [ 23, 554,  45],
       [  9, 128, 235]])

# Without using ColumnTransformer

In [None]:
# Binary bag-of-words over the raw sentences, with English stop words removed.
# The column is passed as a Series, so the vectorizer is applied directly.
model = CountVectorizer(
    binary=True,
    stop_words='english',
)
res = model.fit_transform(df1['Sentence'])

In [None]:
# Preview only the first rows. Densifying the whole document-term matrix
# (thousands of rows by thousands of columns) allocates hundreds of MB just to
# print a repr that numpy abbreviates anyway.
res[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Terms in the fitted vocabulary, in the column order of the matrix.
cols=model.get_feature_names_out()
cols

array(['00', '000', '000063', ..., 'ñskyl', 'óeur', 'úo'], dtype=object)

In [None]:
# Show a small sample of the term -> column-index mapping; the full dictionary
# has thousands of entries and would flood the cell output.
dict(list(model.vocabulary_.items())[:10])

{'geosolutions': 4574,
 'technology': 9925,
 'leverage': 6016,
 'benefon': 1650,
 'gps': 4666,
 'solutions': 9322,
 'providing': 7959,
 'location': 6142,
 'based': 1569,
 'search': 8905,
 'communities': 2487,
 'platform': 7623,
 'relevant': 8333,
 'multimedia': 6765,
 'content': 2657,
 'new': 6890,
 'powerful': 7750,
 'commercial': 2465,
 'model': 6652,
 'esi': 3700,
 'lows': 6208,
 '50': 532,
 'bk': 1726,
 'real': 8179,
 'possibility': 7724,
 'quarter': 8048,
 '2010': 255,
 'componenta': 2546,
 'net': 6876,
 'sales': 8758,
 'doubled': 3303,
 'eur131m': 3758,
 'eur76m': 3876,
 'period': 7482,
 'year': 11021,
 'earlier': 3420,
 'moved': 6734,
 'zero': 11070,
 'pre': 7768,
 'tax': 9900,
 'profit': 7884,
 'loss': 6188,
 'eur7m': 3877,
 'according': 884,
 'finnish': 4245,
 'russian': 8696,
 'chamber': 2230,
 'commerce': 2464,
 'major': 6302,
 'construction': 2640,
 'companies': 2490,
 'finland': 4240,
 'operating': 7149,
 'russia': 8695,
 'swedish': 9774,
 'buyout': 2001,
 'firm': 4255,
 '