# Imports

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm
tqdm.pandas()
from preprocess import clean_text
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, fbeta_score, precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve, log_loss,ConfusionMatrixDisplay,plot_precision_recall_curve
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Reading inputs

In [3]:
data_train = pd.read_excel("../data/Data_Train.xlsx")
data_test = pd.read_excel("../data/Data_Test.xlsx")

In [4]:
data_train.head(5)

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3.0
1,How formidable is the opposition alliance amon...,0.0
2,Most Asian currencies were trading lower today...,3.0
3,"If you want to answer any question, click on ‘...",1.0
4,"In global markets, gold prices edged up today ...",3.0


In [5]:
data_test.head(5)

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


## Data Cleaning

In [6]:
data_train['Clean_STORY'] = data_train['STORY'].progress_apply(clean_text) 


100%|██████████| 7628/7628 [01:27<00:00, 87.56it/s] 


## Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = data_train.drop(columns = ['SECTION'])
y = data_train['SECTION']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state = 42,stratify = y)

In [9]:
X_train.shape

(6102, 2)

In [10]:
X_test.shape

(1526, 2)

In [11]:
y_train.value_counts(normalize=True)

1.0    0.363324
2.0    0.252212
0.0    0.221075
3.0    0.163389
Name: SECTION, dtype: float64

In [12]:
y_test.value_counts(normalize=True)

1.0    0.363696
2.0    0.252294
0.0    0.220839
3.0    0.163172
Name: SECTION, dtype: float64

In [13]:
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('lr',LogisticRegression())
])

In [14]:
pipeline.fit(X['Clean_STORY'],y)

In [15]:
pipeline.predict(X['Clean_STORY'])

array([3., 0., 3., ..., 1., 0., 2.])

In [16]:
pipeline.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'lr': LogisticRegression(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': '

In [17]:
param_grid = {'tfidf__max_features':[1000,2000,2700,3000,3500],
              'lr__C':[0.01,0.1,10],
              }

grid = GridSearchCV(pipeline,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)

In [18]:
(X_train.shape,y_train.shape)

((6102, 2), (6102,))

In [19]:
grid.fit(X_train['Clean_STORY'],y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
grid.best_estimator_

In [23]:
best_grid_est = grid.best_estimator_.predict(X_test['Clean_STORY'])
print(classification_report(y_test,best_grid_est))

              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97       337
         1.0       0.97      0.97      0.97       555
         2.0       0.96      0.97      0.97       385
         3.0       0.97      0.96      0.96       249

    accuracy                           0.97      1526
   macro avg       0.97      0.97      0.97      1526
weighted avg       0.97      0.97      0.97      1526



In [25]:
import pickle

grid_file = open('../models/news_classification_app_lr.pkl','wb')

pickle.dump(grid.best_estimator_,grid_file)

grid_file.close()