In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
pd.set_option("display.max_colwidth", 200)

In [2]:
data = pd.read_pickle("headline_cleaned.pkl")
data.head(50)

Unnamed: 0,headline_text,senti_label,cleaned_headline_w/o_SW,cleaned_headline_with_SW
0,act fire witnesses must be aware of defamation,0,act fire witness must be aware of defamation,act witness aware defamation
1,air nz staff in aust strike for pay rise,0,air nz staff in aust strike for pay rise,air staff aust strike pay rise
2,air nz strike to affect australian travellers,0,air nz strike to affect australian traveller,air strike affect australian traveller
3,ambitious olsson wins triple jump,1,ambitious olsson win triple jump,ambitious olsson win triple jump
4,antic delighted with record breaking barca,1,antic delighted with record breaking barca,antic delighted record breaking barca
5,aust addresses un security council over iraq,1,aust address un security council over iraq,aust address security council iraq
6,australia is locked into war timetable opp,0,australia is locked into war timetable opp,australia locked war timetable opp
7,barca take record as robson celebrates birthday in,1,barca take record a robson celebrates birthday in,barca record robson celebrates birthday
8,big hopes for launceston cycling championship,1,big hope for launceston cycling championship,big hope launceston cycling championship
9,big plan to boost paroo water supplies,1,big plan to boost paroo water supply,big plan boost paroo water supply


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline

In [10]:
X = data['cleaned_headline_with_SW']
y = data['senti_label']

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

CV = CountVectorizer(min_df=10, max_features=1000)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)

CV_pipe = Pipeline([('CV', CV) , ('LR', LR1)] )

results = cross_val_score(CV_pipe, X, y, cv=kfold, scoring='accuracy')
print(np.round((results.mean())*100, 2), np.round((results.std())*100, 2)) 

CV_pipe.fit(X,y)
# Save the trained model
#joblib.dump(CV_pipe, 'sentiment_analysis_model.pkl')
len(CV_pipe['CV'].vocabulary_) 

88.21 0.22


1000

In [13]:
test_headline = ["2024 Hyundai Creta receives 1 lakh bookings 3 months Check features variants high demand DETAILS "]
print(CV_pipe.predict(test_headline))

loaded_model = joblib.load('sentiment_analysis_model.pkl')

# Now you can use loaded_model for prediction
#test_headline = ["One in five BEST buses now electric make up 1.6% of all"]
predicted_label = loaded_model.predict(test_headline)
print(predicted_label)

[0]
[0]


In [36]:
import pickle
with open(r'LR_Pipeline.pickle', 'wb') as handle:
	pickle.dump(CV_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
with open(r'LR_Pipeline.pickle', 'rb') as handle:
    LR_pipeline = pickle.load(handle)

In [38]:
LR_pipeline.predict(test_headline)

array([1], dtype=int64)

In [39]:
import keras
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
import pydot
import keras.backend as K

In [40]:
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(64, input_shape=(n_words,), activation='relu', kernel_regularizer='l2'))
#     model.add(Dropout(0.25))
    model.add(Dense(32, activation='relu', kernel_regularizer='l2'))
#     model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid'))
    
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # summarize defined model
    model.summary()    
    plot_model(model, to_file='model1.png', show_shapes=True)
    return model

In [41]:
# fit a tokenizer
from keras_preprocessing.text import Tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer() # num_words=1000
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [42]:
# create the tokenizer
train_docs = data['cleaned_headline_with_SW']
tokenizer = create_tokenizer(train_docs)

In [43]:
X = tokenizer.texts_to_matrix(train_docs, mode='binary')

In [44]:
import pickle
with open(r'tokenizer.pickle', 'wb') as handle:
	pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
X.shape

(50000, 16535)

In [46]:
y = data['senti_label']

In [47]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,  test_size=0.2,random_state=42) 

MemoryError: Unable to allocate 1.23 GiB for an array with shape (10000, 16535) and data type float64

In [None]:
Xtrain.shape, Xtest.shape

In [None]:
# define network
K.clear_session() 
n_words = Xtrain.shape[1]
model = define_model(n_words)

In [None]:
# fit network
checkpoint = ModelCheckpoint("BestModel.keras",
                             monitor="val_loss", 
                             mode="min",
                             save_best_only = True,
                             verbose=1)
callbacks = [checkpoint]

h = model.fit(Xtrain, ytrain, validation_split=0.1, callbacks = callbacks, batch_size=180, epochs=25, verbose=2)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
plt.plot(list(range(1,26)), h.history['loss'], label='Training loss')
plt.plot(list(range(1,26)), h.history['val_loss'], label='Validation loss')
plt.legend(fontsize=15)
plt.grid()
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Loss Fn valaues', fontsize=15)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
plt.plot(list(range(1,26)), h.history['accuracy'], label='Training Accuracy')
plt.plot(list(range(1,26)), h.history['val_accuracy'], label='Validation Accuracy')
plt.legend(fontsize=15)
plt.grid()
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)

In [None]:
# evaluate the model ONCe on the test set
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

In [None]:
test_headline = ["degraded quality food government school"]
#test_tweeet = ["love talk makememories unplug relax iphone smartphone wifi connect"]
test_headline_vec = tokenizer.texts_to_matrix(test_headline, mode='binary')
test_headline_vec.shape

In [None]:
import keras
best_model = keras.models.load_model('BestModel.keras')
print(best_model)

In [None]:
prediction = best_model.predict(test_headline_vec)
print(prediction)