In [36]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [37]:
# Folder sub-directories
direct = os.getcwd()
categories = ['business', 'entertainment', 'politics', 'sport', 'tech']

In [38]:
# Generate dataframe for articles
dataframe = []

for i, category in enumerate(categories):  # Loop for each sub-directory

    curr_directory = os.path.join(direct, category)

    for file in os.listdir(curr_directory):  # Loop for each article in category

        with open(os.path.join(curr_directory, file)) as f:  
            content = f.read()

        curr_entry = {'Category': i, 'Content': content}  # Create dictionary for dataframe

        dataframe.append(curr_entry)


In [39]:
# Create dictionary from inputs
df = pd.DataFrame(dataframe)
df

Unnamed: 0,Category,Content
0,0,Ad sales boost Time Warner profit\n\nQuarterly...
1,0,Dollar gains on Greenspan speech\n\nThe dollar...
2,0,Yukos unit buyer faces loan claim\n\nThe owner...
3,0,High fuel prices hit BA's profits\n\nBritish A...
4,0,Pernod takeover talk lifts Domecq\n\nShares in...
...,...,...
2220,4,BT program to beat dialler scams\n\nBT is intr...
2221,4,Spam e-mails tempt net shoppers\n\nComputer us...
2222,4,Be careful how you code\n\nA new European dire...
2223,4,US cyber security chief resigns\n\nThe man mak...


In [40]:
# Clean the input text for tfidf vectorization
df['Content_Cleaned'] = df['Content'].str.replace('\r', ' ')                                 # Remove line separators
df['Content_Cleaned'] = df['Content_Cleaned'].str.replace('\n', ' ')                         # Remove new line character
df['Content_Cleaned'] = df['Content_Cleaned'].apply(lambda x: " ".join(x.split()))           # Remove multiple spaces in a row
df['Content_Cleaned'] = df['Content_Cleaned'].apply(lambda x: re.sub(r'\[[0-9]*]', ' ', x))  # Remove in-text citations ([0], etc)

df['Content_Cleaned'] = df['Content_Cleaned'].str.replace('"', '')                           # Remove quotation marks
df['Content_Cleaned'] = df['Content_Cleaned'].str.lower()                                    # Lowercase

df['Content_Cleaned'] = df['Content_Cleaned'].str.replace("'s", '')                          # Remove possessive form

# Remove common punctuations
punctuations = ['.,:;!?'] 

for punctuation in punctuations: 
    df['Content_Cleaned'] = df['Content_Cleaned'].str.replace('punctuation', '') 


In [41]:
nltk.download('punkt')
nltk.download('wordnet')

# Load Stopwords
stop_file = open('stopwords-en.txt', 'r')
stop_words = stop_file.read().split('\n')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samso\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

In [43]:
lemmatized_text_list = []

for row in range(len(df)):
    
    # Create an empty list containing lemmatized words
    curr_lemmatized = []
    text = df.loc[row]['Content_Cleaned']
    
    text_words = text.split(" ")  # Tokenize text
 
    # Iterate through every word to lemmatize
    for word in text_words:
 
        if word not in stop_words: # Ignore stop-words

            curr_lemmatized.append(lemmatizer.lemmatize(word, pos="v"))
        
    # Rejoin text 
    lemmatized_text = " ".join(curr_lemmatized)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

df['Content_Lemmatized'] = lemmatized_text_list

In [44]:
# Set up training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(df['Content_Lemmatized'], df['Category'], test_size=0.15, random_state=0)

In [50]:
# Parameter Selection
ngram_range = (1, 2)  # Include unigram, bigrams
min_df = 10
max_df = 1.0
max_features = 300 # Max 300 words

In [51]:
# Inititalize tfidf vectorizer
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
# Train the tfidf vectorizer on training data
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

# Use vectorizer on testing data
features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(1891, 300)
(334, 300)


In [52]:
from sklearn import svm

SVM = svm.SVC(random_state=0)

# Possible Regularization Parameters
C = [.0001, .001, .01, .1, 1]

# Possible Gamma Parameters
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# Possible Degree Parameters for Polynomial
degree = [1, 2, 3, 4, 5]

# Possible Kernel Parameters
kernel = ['linear', 'rbf', 'poly']

probability = [True]

In [53]:
from sklearn.model_selection import RandomizedSearchCV

# Perform Random Search
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree,
              'probability': probability
             }

random_search = RandomizedSearchCV(estimator=SVM,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=0)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=SVC(random_state=0), n_iter=50,
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1],
                                        'degree': [1, 2, 3, 4, 5],
                                        'gamma': [0.0001, 0.001, 0.01, 0.1, 1,
                                                  10, 100],
                                        'kernel': ['linear', 'rbf', 'poly'],
                                        'probability': [True]},
                   random_state=0, scoring='accuracy', verbose=1)

In [54]:
print("Best Parameters:", random_search.best_params_)
print("Accuracy:", random_search.best_score_)

Best Parameters: {'probability': True, 'kernel': 'poly', 'gamma': 100, 'degree': 3, 'C': 0.01}
Accuracy: 0.9518795227194593


In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Use results from random search to define grid search
param_grid = [
  {'C': C, 'kernel':['linear'], 'probability': probability},
  {'C': C, 'kernel':['poly'], 'degree': degree, 'probability': probability},
  {'C': C, 'kernel':['rbf'], 'gamma': gamma, 'probability': probability}
]

# Create a base model
svc = svm.SVC(random_state=0)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=ShuffleSplit(n_splits = 3, test_size = .33, random_state = 0),
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 65 candidates, totalling 195 fits


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=0, test_size=0.33, train_size=None),
             estimator=SVC(random_state=0),
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1],
                          'kernel': ['linear'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1, 1],
                          'degree': [1, 2, 3, 4, 5], 'kernel': ['poly'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1, 1],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'kernel': ['rbf'], 'probability': [True]}],
             scoring='accuracy', verbose=1)

In [56]:
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'degree': 2, 'kernel': 'poly', 'probability': True}
Accuracy: 0.9557333333333333


In [57]:
# Use grid search results
best_SVM = grid_search.best_estimator_
best_SVM

SVC(C=1, degree=2, kernel='poly', probability=True, random_state=0)

In [58]:
# Train the predict
best_SVM.fit(features_train, labels_train)
svc_pred = best_SVM.predict(features_test)

In [59]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Training accuracy
print("The training accuracy for model is: ", accuracy_score(labels_train, best_SVM.predict(features_train)))

The training accuracy is:  1.0


In [60]:
# Test accuracy
print("The test accuracy for model is: ", accuracy_score(labels_test, svc_pred))

The test accuracy is:  0.9550898203592815


In [61]:
# Export the models 
import pickle

with open('tfidf', 'wb') as output:
    pickle.dump(tfidf, output)
    
with open('SVM', 'wb') as output:
    pickle.dump(best_SVM, output)
