# Model
1. In this notebook all the csv files of the banks are imported and concatenated.
2. This concatenated data is mixed to avoid training bank news serially => Better training
3. Finally, the model trained using Naive Bayes classifier and linear regression, which is saved in a pickle format, which is used by the API.
4. We are Naive Bayes for our trial as it provides better results
4. There are only two classes to this model:
    a) 0 for negative news => Negative News
    b) 1 for positve news => Positive News
      

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import string
from nltk.corpus import stopwords
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
def data():
    """
    The Data() function:    
    1. Combines and shuffles the data of all the banks.
    2. Converts the label to an integer.
    3. Returns the combined data with labels
    """
    
    axis_df = pd.read_csv('Axis.csv')
    hdfc_df = pd.read_csv('HDFC.csv')
    sbi_df = pd.read_csv('State.csv')
    ind_df = pd.read_csv('IndusInd.csv')
    icici_df = pd.read_csv('ICICI.csv')

    df = [axis_df, hdfc_df,sbi_df,ind_df,icici_df]
    result = pd.concat(df)
    
    result = result.sample(frac=1).reset_index(drop=True)
    result['label']= result['label'].astype('int')
   
    return result

In [3]:
print(data())

      label                                               news
0         0  State-owned banks India barred employees posti...
1         1  ICICIBC realised ~Rs 31bn 1QFY21 far, stake sa...
2         1  Country’s largest private lender witnessed imp...
3         1  Ashwani Gujral ashwanigujral.com told CNBC-TV1...
4         0  Country's largest lender (SBI) considered patr...
...     ...                                                ...
1896      0  HDFC Bank Ltd informed BSE subject approval Re...
1897      1  HDFC Bank Ltd informed BSE Bank August 30, 201...
1898      0  Asset quality concerns continue haunt Indian b...
1899      0  , India's fourth-largest private sector lender...
1900      0  , Mastercard SAP Concur joined hands spending ...

[1901 rows x 2 columns]


In [4]:
#creating a text cleaning function to pass into countvectorizer
def text_cleaner(news):
    """
    The function text_cleaner:
    1. Removes punctuations
    2. Removes Stopwords
    """
     # Check characters to see if they are in punctuation
    nopunc = [char for char in news if char not in string.punctuation]
      
    
    
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
  

    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]



In [5]:
def data_info(result):
    
    # Vectorization of words, Might take awhile...
    bow_transformer = CountVectorizer(analyzer=text_cleaner).fit(result['news'])

    # Print total number of vocab words
    print('Total number of unique words:',len(bow_transformer.vocabulary_))

    #Tokenization or vectorization of all the generated news
    news_bow = bow_transformer.transform(result['news'])

    print('Shape of Sparse Matrix: ', news_bow.shape)
    print('Amount of Non-Zero occurences: ', news_bow.nnz)

    #calculating sparsity
    sparsity = (100.0 * news_bow.nnz / (news_bow.shape[0] * news_bow.shape[1]))
    print('sparsity: {}'.format(sparsity))
    
    
    print(result.groupby(by='label').count())
    print('percentage positive news',(870/len(result))*100 )
    print('percentage negative news',(1031/len(result))*100)

In [6]:
data_info(data())

Total number of unique words: 44893
Shape of Sparse Matrix:  (1901, 44893)
Amount of Non-Zero occurences:  361680
sparsity: 0.42380272887570775
       news
label      
0      1031
1       870
percentage positive news 45.765386638611254
percentage negative news 54.23461336138874


In [7]:
#df = data()

#print(df)


#df['length_of_news']=df['news'].apply(len)
#print(df.head())
#df.hist(column='length_of_news', by='label' )
#df.countplot
#df['news'][1]

In [8]:
# creating train and test data:
def model_training_with_testdata(result,model_name='nb'):
    X_train, X_test, y_train, y_test = train_test_split(result['news'], result['label'], test_size=0.3)
    print('length of training data',len(X_train),'and length of the test data',len(X_test),'and total data',len(X_train) + len(X_test))
    
    if model_name.lower()=='nb':
        pipe = Pipeline([
        ('bow', CountVectorizer(analyzer=text_cleaner)),  # strings to token integer counts
        ('classifier', MultinomialNB()),])  # train on TF-IDF vectors w/ Naive Bayes classifier
        pipe.fit(X_train,y_train)
        predictions = pipe.predict(X_test)
        print('predictions',predictions[0:5])
        print('labels',y_test[0:5])
        print(classification_report(predictions,y_test)) # randomly selected data lacks positive news! This is NO
    
    elif model_name.lower()=='lr':
        pipe=Pipeline([('bow', CountVectorizer(analyzer=text_cleaner)),  # strings to token integer counts
       # ('StandardScaler',StandardScaler()),  # integer counts to weighted TF-IDF scores
        ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
        ])
   
        pipe.fit(X_train, y_train)  # apply scaling on training data
        print("Accuracy:", pipe.score(X_test, y_test))
        predictions = pipe.predict(X_test)
        print(classification_report(predictions,y_test))
   

In [9]:
model_training_with_testdata(data())   

length of training data 1330 and length of the test data 571 and total data 1901
predictions [0 1 1 0 0]
labels 1121    0
1413    0
669     0
369     0
68      0
Name: label, dtype: int64
              precision    recall  f1-score   support

           0       0.67      0.62      0.64       341
           1       0.49      0.54      0.51       230

    accuracy                           0.59       571
   macro avg       0.58      0.58      0.58       571
weighted avg       0.59      0.59      0.59       571



In [10]:
model_training_with_testdata(data(),model_name='lr')

length of training data 1330 and length of the test data 571 and total data 1901


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.5569176882661997
              precision    recall  f1-score   support

           0       0.62      0.60      0.61       331
           1       0.47      0.50      0.49       240

    accuracy                           0.56       571
   macro avg       0.55      0.55      0.55       571
weighted avg       0.56      0.56      0.56       571



In [14]:
def model_training_fulldata(result,model_name='nb'):
    if model_name.lower()=='nb':
        pipe = Pipeline([
        ('bow', CountVectorizer(analyzer=text_cleaner)),  # strings to token integer counts
        ('classifier', MultinomialNB()),])  # train on TF-IDF vectors w/ Naive Bayes classifier
        pipe.fit(result['news'],result['label'])
        filename = 'model_nb.sav'
        pickle.dump(pipe, open(filename, 'wb'))
    
        predictions = pipe.predict(result['news'])
        print('predictions',predictions[0:5])
        
        print(classification_report(predictions,result['label'])) # randomly selected data lacks positive news! This is NO
    
    elif model_name.lower()=='lr':
        pipe=Pipeline([('bow', CountVectorizer(analyzer=text_cleaner)),  # strings to token integer counts
       # ('StandardScaler',StandardScaler()), 
        ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
        ])
   
        pipe.fit(result['news'], result['label'])  # apply scaling on training data
        filename = 'model_lr.sav'
        pickle.dump(pipe, open(filename, 'wb'))
        print("Accuracy:", pipe.score(result['news'], result['label']))
        predictions = pipe.predict(result['news'])
        print(classification_report(predictions,result['label']))
   
    

In [12]:
model_training_fulldata(data()) # this result is only to check if the function is working or not

predictions [1 1 0 0 0]
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1053
           1       0.95      0.98      0.96       848

    accuracy                           0.97      1901
   macro avg       0.97      0.97      0.97      1901
weighted avg       0.97      0.97      0.97      1901



In [15]:
model_training_fulldata(data(),model_name='lr') # This is only to to check if the function is working

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.9984218832193582
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1034
           1       1.00      1.00      1.00       867

    accuracy                           1.00      1901
   macro avg       1.00      1.00      1.00      1901
weighted avg       1.00      1.00      1.00      1901

