# Get Data

In [1]:
import pandas as pd
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split


cwd = os.path.abspath('./news Classifier/CSV Data for Classifier') 
files = os.listdir(cwd) 
df = pd.DataFrame()
for file in files:
    if file.endswith('.csv'):
        df = df.append(pd.read_csv(cwd+"/"+file), ignore_index=True) 
        
df = df.dropna()
df.columns
X = df.drop(['Unnamed: 0', 'date', 'location', 'news title', 'news source(url)',
       'keywords', 'class_name', 'new_class_name'], axis=1)

y = df['new_class_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train

Unnamed: 0,content summary
12785,"Its flagship product is captagon, an illegal, ..."
10872,The USTR did not say when the tariffs would ta...
5450,BASEL (BLOOMBERG) - Climate change threatens t...
11284,SINGAPORE - The High Court has dismissed a den...
13102,MOSCOW (REUTERS) - President Vladimir Putin sa...
...,...
9761,"JAKARTA/SURABAYA (XINHUA, THE JAKARTA POST/ASI..."
8379,"Washington's North Korea policy is becoming ""m..."
7330,MANILA - A typhoon that gathered devastating s...
4383,"A jump in fuel prices, a dip in port activity ..."


In [2]:
# import the BaseEstimator
import re 
from sklearn.base import BaseEstimator
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# define the class FeatureEngineering
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class FeatureEngineering(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self
    
        
    def transform(self, x_dataset):
        """
        Function: split text into words and return the root form of the words
        Args:
          text(str): the article
        Return:
          lem(list of str): a list of the root form of the article words
        """
        def preprocess(text):

            # Normalize text
            text = re.sub(r"[^a-zA-Z]", " ", str(text).lower())

            # Tokenize text
            token = word_tokenize(text)

            # Remove stop words
            stop = stopwords.words("english")
            new_stop_words_list = ['said', 'us', 'also', 'mr']
            stop.extend(new_stop_words_list)
            words = [t for t in token if t not in stop]

            # Lemmatization
            lem = [WordNetLemmatizer().lemmatize(w) for w in words]

            return lem
        
        x_dataset.head()

        x_dataset["Preprocessed_Text"] = x_dataset['content summary'].apply(lambda x: preprocess(x))
        x_dataset['Preprocessed_Text2'] = x_dataset['Preprocessed_Text'].apply(' '.join)
        
        
        return x_dataset

In [12]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from mlxtend.feature_selection import ColumnSelector


preprocessor = Pipeline(steps=[('feature engineering', FeatureEngineering()), 
                               ('col_selector', ColumnSelector(cols=('Preprocessed_Text2'),drop_axis=True)),
                               ('tfidf',TfidfVectorizer()),
                            ])


train_features = preprocessor.fit(X_train)
test_features = preprocessor.fit(X_test)

train_features = train_features.transform(X_train)
test_features = test_features.transform(X_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_dataset["Preprocessed_Text"] = x_dataset['content summary'].apply(lambda x: preprocess(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_dataset['Preprocessed_Text2'] = x_dataset['Preprocessed_Text'].apply(' '.join)


In [13]:
train_features

<10980x18643 sparse matrix of type '<class 'numpy.float64'>'
	with 597456 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost 
from sklearn.metrics  import classification_report
from sklearn import metrics
import time
import pickle

def fit_eval_model(cls_name,model, train_features, y_train, test_features, y_test):
    
    """
    Function: train and evaluate a machine learning classifier.
    Args:
      model: machine learning classifier
      train_features: train data extracted features
      y_train: train data lables
      test_features: train data extracted features
      y_test: train data lables
    Return:
      results(dictionary): a dictionary of the model training time and classification report
    """
    results ={}
    
    # Start time
    start = time.time()
    # Train the model
    model.fit(train_features, y_train)
    # End time
    end = time.time()
    # Calculate the training time
    results['train_time'] = end - start
    
    # Test the model
    train_predicted = model.predict(train_features)
    test_predicted = model.predict(test_features)
    
    # Save the model
    filename = cls_name + '.sav'
    pickle.dump(model, open(filename, 'wb'))
    
    # Classification report
    results['classification_report'] = classification_report(y_test, test_predicted)
        
    return results

In [23]:
# sv = svm.SVC()
# ab = AdaBoostClassifier(random_state = 1)
# gb = GradientBoostingClassifier(random_state = 1)
xgb = xgboost.XGBClassifier(random_state = 1)
# tree = DecisionTreeClassifier()
# nb = MultinomialNB()


# Fit and evaluate models
results = {}
# for cls in [sv, ab, gb, xgb, tree, nb]:
for cls in [xgb]:
    cls_name = cls.__class__.__name__
    results[cls_name] = {}
    results[cls_name] = fit_eval_model(cls_name,cls, train_features, y_train, test_features, y_test)





In [21]:
for res in results:
    print (res)
    print()
    for i in results[res]:
        print (i, ':')
        print(results[res][i])
        print()
    print ('-----')
    print()

XGBClassifier

train_time :
35.16069579124451

classification_report :
                                          precision    recall  f1-score   support

    Acute climatological event (cyclone)       0.68      0.53      0.60       188
   Acute climatological event (droughts)       0.60      0.65      0.62       344
      Acute climatological event (flood)       0.71      0.68      0.69       231
Acute climatological event (heat stress)       0.56      0.30      0.39       114
                         Economic Crisis       0.57      0.66      0.61       189
                       Man-Made Disaster       0.80      0.79      0.80       160
                      Military Conflicts       0.66      0.68      0.67       210
                               Terrorism       0.92      0.90      0.91       199
                           Trade Dispute       0.66      0.65      0.65       167
                             cyberattack       0.95      0.92      0.94       256
                       geo

ERROR: Could not find a version that satisfies the requirement pickle
ERROR: No matching distribution found for pickle


In [None]:
# load the model from disk
loaded_model = pickle.load(open("XGBClassifier.sav", 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)