In [1]:
#import libraries
import numpy as np
import pandas as pd
import re
import sqlite3
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
#load every row of the Article table into a dataframe
conn = sqlite3.connect('Project 01 - Database.db')
try:
  df = pd.read_sql("SELECT * FROM Article", conn)
finally:
  #close the connection even when the query raises, so the database handle is never left open
  conn.close()

In [3]:
#count how many articles carry each category label
df['category'].value_counts()

Unknown          1000
Sports            294
Business          270
Politics          239
Technology        225
Entertainment     197
Name: category, dtype: int64

In [4]:
#define a function that will clean the raw input text in preparation for analysis. Returns a tuple containing
#both the cleaned text and the total number of words in the cleaned text.
def get_clean_text(raw_text):
  """Clean raw article text and count the words that remain.

  Steps, in order:
    1. Period-separated acronyms ('U.S.A', 'L.A.') lose their periods.
    2. ASCII digits, underscores and any character that is neither a word
       character nor whitespace are replaced by a space.
    3. The text is split on whitespace into words.
    4. Every word is lowercased unless it is two to six characters long and
       entirely uppercase, in which case it is treated as an acronym and kept.

  Parameters:
    raw_text (str): the unprocessed article text.

  Returns:
    tuple: (cleaned text with words joined by single spaces, number of words).
  """
  #collapse period-separated acronyms (e.g., 'U.S.A', 'L.A.', etc.). Each regex match is rewritten on its own, so
  #a short acronym seen earlier in the text (e.g., 'S.A.') can no longer corrupt a longer one that contains it
  #(e.g., 'U.S.A.'), which is what happened when matches were replaced one after another with str.replace
  text = re.sub(r'(?:[A-Z]\.){2,}', lambda match: match.group(0).replace('.', ''), raw_text)
  #replace all numbers with a space
  text = re.sub(r'[0-9]', ' ', text)
  #replace all underscores with a space (underscores count as word characters, so the next step would keep them)
  text = re.sub(r'_', ' ', text)
  #replace anything else that isn't a word character or whitespace (e.g., punctuation, special symbols, etc.)
  text = re.sub(r'[^\w\s]', ' ', text)
  #split() without arguments splits on runs of whitespace and ignores leading/trailing whitespace, so no separate
  #whitespace-collapsing or strip step is needed
  words = text.split()
  #keep two-to-six character all-uppercase words (acronyms) as they are; lowercase everything else
  words = [word if (1 < len(word) < 7 and word.isupper()) else word.lower() for word in words]
  #return the cleaned text and the number of words in the cleaned text
  return (' '.join(words), len(words))

In [5]:
#clean raw text for each article; get_clean_text returns a (clean_text, total_words) tuple per article
cleaned_articles = [get_clean_text(raw_text) for raw_text in df.raw_text]
#assign each column from its own list so pandas infers a dtype per column (integer word counts) instead of
#coercing the mixed (str, int) tuples together in a single two-column assignment
df['clean_text'] = [clean_text for clean_text, _ in cleaned_articles]
df['total_words'] = [total_words for _, total_words in cleaned_articles]

#preview data
df.tail()

Unnamed: 0,id,category,raw_text,clean_text,total_words
2220,9981403,Technology,The US Federal Bureau of Investigation is warn...,the US federal bureau of investigation is warn...,220
2221,9983844,Sports,Scrum-half Matt Dawson is an injury doubt for ...,scrum half matt dawson is an injury doubt for ...,155
2222,9986243,Unknown,"The US economy has grown more than expected, e...",the US economy has grown more than expected ex...,197
2223,9987006,Unknown,"Three years after a gruelling economic crisis,...",three years after a gruelling economic crisis ...,849
2224,9988130,Technology,BBC Sport unveils its new analysis tool Piero ...,BBC sport unveils its new analysis tool piero ...,460


In [6]:
#fit a TF-IDF vectorizer on the cleaned text; lowercase=False keeps the casing produced by get_clean_text
vectorizer = TfidfVectorizer(lowercase=False)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])
#convert the fitted matrix to a dense array and store one row of scores per article
article_tfidf_scores = np.array(tfidf_matrix.todense())
df['tfidf_scores'] = list(article_tfidf_scores)

In [7]:
#map each textual category label to an integer code; 'Unknown' becomes -1
recode = {
  'Business': 0,
  'Entertainment': 1,
  'Politics': 2,
  'Sports': 3,
  'Technology': 4,
  'Unknown': -1,
}
#a label missing from the mapping raises KeyError rather than passing through silently
df['category'] = [recode[label] for label in df['category']]
df.tail()

Unnamed: 0,id,category,raw_text,clean_text,total_words,tfidf_scores
2220,9981403,4,The US Federal Bureau of Investigation is warn...,the US federal bureau of investigation is warn...,220,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2221,9983844,3,Scrum-half Matt Dawson is an injury doubt for ...,scrum half matt dawson is an injury doubt for ...,155,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2222,9986243,-1,"The US economy has grown more than expected, e...",the US economy has grown more than expected ex...,197,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2223,9987006,-1,"Three years after a gruelling economic crisis,...",three years after a gruelling economic crisis ...,849,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2224,9988130,4,BBC Sport unveils its new analysis tool Piero ...,BBC sport unveils its new analysis tool piero ...,460,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
#count the articles per numeric category code
df['category'].value_counts()

-1    1000
 3     294
 0     270
 2     239
 4     225
 1     197
Name: category, dtype: int64

In [9]:
#split data into the unlabeled articles to predict (category -1) and the labeled articles used for training
is_unlabeled = df.category == -1
#copy each subset so later column assignments write to an independent frame instead of a filtered slice of df
#(assigning into a slice triggers pandas' SettingWithCopyWarning)
df_predict = df[is_unlabeled].copy()
df = df[~is_unlabeled].copy()

In [10]:
#estimate the model
#alternative model left disabled by the original author:
#model = KNeighborsClassifier(n_neighbors=3)
#pin random_state so that refitting on the same data gives the same model after a kernel restart
model = GradientBoostingClassifier(random_state=0)
#each row of tfidf_scores is one article's feature vector; category holds the numeric labels
model.fit(df.tfidf_scores.to_list(), df.category)

In [11]:
#predict a category code for every unlabeled article
#assign() returns a new frame with the category column replaced, so nothing is written into a filtered slice
#of the original dataframe (which would trigger pandas' SettingWithCopyWarning)
df_predict = df_predict.assign(category=model.predict(df_predict.tfidf_scores.to_list()))
df_predict

Unnamed: 0,id,category,raw_text,clean_text,total_words,tfidf_scores
2,14775,4,"If you have finished Doom 3, Half Life 2 and H...",if you have finished doom half life and halo d...,499,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,16641,2,Controversial new UK casinos will be banned fr...,controversial new UK casinos will be banned fr...,274,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,17511,3,Justine Henin-Hardenne lost to Elena Dementiev...,justine henin hardenne lost to elena dementiev...,318,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,19326,0,The two most senior executives at US mortgage ...,the two most senior executives at US mortgage ...,388,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,19971,3,Munster's Heineken Cup quarter-final tie again...,munster s heineken cup quarter final tie again...,148,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
2214,9951082,1,The Incredibles movie has beaten Shrek 2 to th...,the incredibles movie has beaten shrek to the ...,171,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2215,9952845,0,The Karachi Stock Exchange (KSE) has recorded ...,the karachi stock exchange KSE has recorded it...,545,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2216,9954056,3,Britain's Jason Gardener enjoyed a double 60m ...,britain s jason gardener enjoyed a double m su...,531,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2222,9986243,0,"The US economy has grown more than expected, e...",the US economy has grown more than expected ex...,197,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
#translate the predicted numeric codes back into their textual category labels
recode = {
  0: 'Business',
  1: 'Entertainment',
  2: 'Politics',
  3: 'Sports',
  4: 'Technology',
}
df_predict['category'] = [recode[code] for code in df_predict['category']]

In [13]:
#display the prediction dataframe, whose category column now holds textual labels
df_predict


Unnamed: 0,id,category,raw_text,clean_text,total_words,tfidf_scores
2,14775,Technology,"If you have finished Doom 3, Half Life 2 and H...",if you have finished doom half life and halo d...,499,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,16641,Politics,Controversial new UK casinos will be banned fr...,controversial new UK casinos will be banned fr...,274,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,17511,Sports,Justine Henin-Hardenne lost to Elena Dementiev...,justine henin hardenne lost to elena dementiev...,318,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,19326,Business,The two most senior executives at US mortgage ...,the two most senior executives at US mortgage ...,388,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,19971,Sports,Munster's Heineken Cup quarter-final tie again...,munster s heineken cup quarter final tie again...,148,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
2214,9951082,Entertainment,The Incredibles movie has beaten Shrek 2 to th...,the incredibles movie has beaten shrek to the ...,171,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2215,9952845,Business,The Karachi Stock Exchange (KSE) has recorded ...,the karachi stock exchange KSE has recorded it...,545,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2216,9954056,Sports,Britain's Jason Gardener enjoyed a double 60m ...,britain s jason gardener enjoyed a double m su...,531,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2222,9986243,Business,"The US economy has grown more than expected, e...",the US economy has grown more than expected ex...,197,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
#write one "id,category" row per prediction to a CSV file, without a header row or an index column
df_predict[['id', 'category']].to_csv('Jhobalia, Kelly, Jain.csv', index=False, header=False)

