In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk

# Make sure the NLTK stop-word corpus is available locally before it is used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Show the English stop-word list that stemming() filters tokens against.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# Textual sentiment label -> class code (codes are kept as strings, exactly as before).
# NOTE(review): the preview of this frame shows 'prediction' already holding numeric
# values; in that case the replace below leaves the column unchanged - confirm
# whether the mapping is still needed.
LABEL_CODES = {'positive': '1', 'negative': '0', 'neutral': '2', 'none': '3'}

news_dataset = pd.read_csv("output.csv")
# 'Prediction' (capital P) is the label column used by the rest of the notebook.
news_dataset['Prediction'] = news_dataset['prediction'].replace(LABEL_CODES)

In [4]:
# (rows, columns) of the loaded frame.
print(news_dataset.shape)

(20278, 3)


In [5]:
# Preview the first rows of the frame.
print(news_dataset.head())

                                                News  prediction  Prediction
0  Indian energy company ONGC Videsh eyes oil  ga...         0.0         0.0
1  China crude demand rising  but costly Saudi oi...         1.0         1.0
2  Oil prices fall 1% as U.S. crude  fuel invento...         0.0         0.0
3          Russian oil output rises by 1% this month         0.0         0.0
4  Gyrating European gas price forecasts leave co...         0.0         0.0


In [6]:
# Count missing values per column.
print(news_dataset.isnull().sum())

News          0
prediction    0
Prediction    0
dtype: int64


In [7]:
# Inspect the headline text before any preprocessing is applied.
print(news_dataset['News'])

0        Indian energy company ONGC Videsh eyes oil  ga...
1        China crude demand rising  but costly Saudi oi...
2        Oil prices fall 1% as U.S. crude  fuel invento...
3                Russian oil output rises by 1% this month
4        Gyrating European gas price forecasts leave co...
                               ...                        
20273    India offers incentives to state-owned oil and...
20274    ICE to launch Houston oil contract  building o...
20275    Brazil to auction 3 mln barrels of pre-salt oi...
20276    Socar’s oil trading arm rolls back expansion a...
20277    Oil rises on bullish demand signals  even as U...
Name: News, Length: 20278, dtype: object


In [8]:
# Label series.
y = news_dataset['Prediction']
# Everything except the label column. `columns=` already names the axis, so no
# separate axis argument is needed.
# NOTE(review): this frame still contains the lowercase 'prediction' column; x is
# reassigned to the headline text further down before any model sees it.
x = news_dataset.drop(columns='Prediction')

In [9]:
# Inspect the frame currently bound to x.
print(x)

                                                    News  prediction
0      Indian energy company ONGC Videsh eyes oil  ga...         0.0
1      China crude demand rising  but costly Saudi oi...         1.0
2      Oil prices fall 1% as U.S. crude  fuel invento...         0.0
3              Russian oil output rises by 1% this month         0.0
4      Gyrating European gas price forecasts leave co...         0.0
...                                                  ...         ...
20273  India offers incentives to state-owned oil and...         0.0
20274  ICE to launch Houston oil contract  building o...         0.0
20275  Brazil to auction 3 mln barrels of pre-salt oi...         1.0
20276  Socar’s oil trading arm rolls back expansion a...         0.0
20277  Oil rises on bullish demand signals  even as U...         0.0

[20278 rows x 2 columns]


In [10]:
# Inspect the label series.
print(y)

0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
        ... 
20273    0.0
20274    0.0
20275    1.0
20276    0.0
20277    0.0
Name: Prediction, Length: 20278, dtype: float64


In [11]:
# Shared Porter stemmer instance; stemming() looks it up as a global.
port_stem = PorterStemmer()

In [12]:
# Filled lazily by _english_stopwords(); avoids re-reading the corpus per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content, stop_words=None, stemmer=None):
    """Normalise a piece of text into a space-joined string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop stop words, stem what is left and join the
    result with single spaces.

    Parameters
    ----------
    content : str
        Text to normalise.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the NLTK English stop-word list,
        which is loaded once and cached as a frozenset.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level ``port_stem`` instance.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' when nothing is left).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    if stemmer is None:
        stemmer = port_stem

    words = re.sub("[^a-zA-Z]", " ", content).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [13]:
# Replace every headline with its stemmed form.
# NOTE: this overwrites the 'News' column in place, so running the cell a second
# time feeds already-stemmed text back through stemming().
news_dataset['News'] = news_dataset['News'].apply(stemming)

In [14]:
# Check the headlines after stemming.
print(news_dataset['News'])

0        indian energi compani ongc videsh eye oil ga h...
1        china crude demand rise costli saudi oil less ...
2              oil price fall u crude fuel inventori swell
3                            russian oil output rise month
4        gyrat european ga price forecast leav compani ...
                               ...                        
20273             india offer incent state own oil ga firm
20274    ice launch houston oil contract build export g...
20275      brazil auction mln barrel pre salt oil aug ppsa
20276     socar oil trade arm roll back expans weak profit
20277    oil rise bullish demand signal even u stockpil...
Name: News, Length: 20278, dtype: object


In [15]:
# Features: the stemmed headline text as an array (rebinds x, which previously held a frame).
x = news_dataset['News'].values

In [16]:
# Labels as an array, aligned row-for-row with x.
y = news_dataset['Prediction'].values

In [17]:
# Inspect the feature and label arrays and confirm the number of labels.
print(x)
print(y)
print(y.shape)

['indian energi compani ongc videsh eye oil ga hot spot africa latin america'
 'china crude demand rise costli saudi oil less desir'
 'oil price fall u crude fuel inventori swell' ...
 'brazil auction mln barrel pre salt oil aug ppsa'
 'socar oil trade arm roll back expans weak profit'
 'oil rise bullish demand signal even u stockpil grow']
[0. 1. 0. ... 1. 0. 0.]
(20278,)


In [18]:
# Confirm the container type of x before it is converted below.
type(x)

numpy.ndarray

In [19]:
# Convert x to a plain Python list.
# NOTE: x is rebound in place, so this cell cannot be re-run on its own
# (a list has no .tolist()); re-run the cell that assigns x from .values first.
x = x.tolist()

In [20]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [21]:
# Learn the vocabulary and IDF weights from every headline in x.
# NOTE(review): this happens before the train/test split further down, so the
# held-out rows contribute to the vocabulary/IDF statistics - consider fitting
# on the training portion only.
vectorizer.fit(x) 

TfidfVectorizer()

In [22]:
# Turn the headlines into their TF-IDF representation.
# NOTE: x is rebound from the list of strings to the transformed matrix, so this
# cell is not re-runnable without first rebuilding x.
x = vectorizer.transform(x)
# Printed as (row, feature index) -> weight entries.
print(x) 

  (0, 6146)	0.42068213349149286
  (0, 5319)	0.2724383011699246
  (0, 3883)	0.33979550085813987
  (0, 3854)	0.06736635845914328
  (0, 3112)	0.34964462943261354
  (0, 2779)	0.29014326500956883
  (0, 2637)	0.3454378208117936
  (0, 2268)	0.14187451703228068
  (0, 1987)	0.2526633488097848
  (0, 1810)	0.16639094507629298
  (0, 1057)	0.22726787846380214
  (0, 172)	0.26160238157562904
  (0, 93)	0.26702686970930356
  (1, 4916)	0.23059888831845182
  (1, 4764)	0.23014251777088446
  (1, 3854)	0.0915174178165663
  (1, 3163)	0.4071525425424571
  (1, 1458)	0.5714980512425205
  (1, 1416)	0.20285729653306614
  (1, 1253)	0.1702477360966784
  (1, 1188)	0.5165556740397272
  (1, 936)	0.2369277995474337
  (2, 5574)	0.6596466210893962
  (2, 4310)	0.24284959529935973
  (2, 3854)	0.12788237603857597
  :	:
  (20275, 4278)	0.3176554676523032
  (20275, 4272)	0.48483996387277056
  (20275, 3854)	0.07764033744549706
  (20275, 3536)	0.2776335174163724
  (20275, 662)	0.2892560320381436
  (20275, 425)	0.243027018417635

In [23]:
# Hold out 10% of the rows for testing; stratify on the labels so both parts keep
# the same class proportions, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=52)

In [24]:
# Logistic regression on the TF-IDF features.
# 'saga' shuffles the data while optimising, so a fixed random_state is needed for
# the fitted coefficients (and the reported accuracies) to be repeatable across
# runs; the seed matches the one used for the train/test split.
# max_iter is raised well above the default so the solver has room to converge.
model = LogisticRegression(solver='saga', max_iter=5000, random_state=52)
model.fit(x_train, y_train)

LogisticRegression(max_iter=5000, solver='saga')

In [25]:
# Accuracy on the rows the model was fitted on.
x_train_prediction = model.predict(x_train)
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [26]:
# Report the accuracy computed on the training rows.
print("Accuracy score of the training data : ", training_data_accuracy)

Accuracy score of the training data :  0.9064657534246575


In [27]:
# Score the fitted model on both partitions: predict first, then compare each set
# of predictions with its ground-truth labels.
y_train_pred, y_test_pred = (model.predict(part) for part in (x_train, x_test))

train_accuracy, test_accuracy = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

# A large gap between the two numbers indicates overfitting.
print('Training accuracy:', train_accuracy)
print('Testing accuracy:', test_accuracy)

Training accuracy: 0.9064657534246575
Testing accuracy: 0.8441814595660749


In [28]:
# NOTE(review): neither `Tokenizer` nor `tokenizer` is referenced by any other
# visible cell; if nothing else uses them, this import and instance can be dropped
# (which would also remove the keras_preprocessing dependency).
from keras_preprocessing.text import Tokenizer

tokenizer = Tokenizer()

def get_prediction1(text, preprocess=True):
    """Predict the class label for a single headline.

    The TF-IDF vocabulary and the classifier were built from headlines that had
    been passed through `stemming`, so the same normalisation is applied here by
    default; vectorising raw text would miss every vocabulary entry whose stem
    differs from the surface word.

    Parameters
    ----------
    text : str
        The headline to classify.
    preprocess : bool, default True
        Apply `stemming` before vectorising. Pass False when `text` has already
        been normalised by the caller.

    Returns
    -------
    The result of ``model.predict`` for the single input row.
    """
    cleaned = stemming(text) if preprocess else text
    # transform() expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)


In [30]:
import joblib

# Persist the fitted vectorizer so it can be reused alongside the saved model.
# A file written by joblib.dump should be read back with joblib.load.
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [None]:
import pickle

import joblib

# Save the trained model to a file
filename = 'econocast_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# Reload the vectorizer from file. 'vectorizer.pkl' was written with joblib.dump,
# so it is read back with joblib.load rather than pickle.load: joblib stores NumPy
# array data in its own layout, which the plain pickle reader is not meant to parse.
vectorizer = joblib.load('vectorizer.pkl')


In [None]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Baseline for comparison: multinomial Naive Bayes trained on the same TF-IDF
# features and split as the logistic-regression model (fit() returns the estimator).
clf = MultinomialNB().fit(x_train, y_train)

# Evaluate on the held-out rows: overall accuracy plus the per-class report.
y_pred = clf.predict(x_test)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")