In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

In [2]:
# Only the labelled training CSV is used in this notebook
# (presumably because the accompanying test set ships without labels - TODO confirm).
# The file location can be overridden with the BBC_NEWS_TRAIN_CSV environment variable so the
# notebook also runs on machines that lack the original directory layout; the default path is unchanged.
TRAIN_CSV_PATH = os.environ.get(
    "BBC_NEWS_TRAIN_CSV",
    "D:/AI or Data Science Internships/ITIC Internship/Automatic Exam Marking Project/BBC News Classification/BBC News Train.csv",
)
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Show the freshly loaded frame (pandas truncates the display to the first and last rows)
train_df

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [4]:
# Encode the Category column as integer codes.
# factorize numbers the categories in order of first appearance in the frame.
train_df["Label_Encoding"] = pd.factorize(train_df["Category"])[0]

In [5]:
# Show the frame again to inspect the new Label_Encoding column
train_df

Unnamed: 0,ArticleId,Text,Category,Label_Encoding
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,4
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,4
1487,1590,weak dollar hits reuters revenues at media gro...,business,0
1488,1587,apple ipod family expands market apple has exp...,tech,1


In [6]:
# Class frequency distribution, printed for the category names and for their integer codes
for column in ("Category", "Label_Encoding"):
    print(train_df[column].value_counts())

# The per-class counts are of similar size, so the data set is reasonably balanced
# and does not suffer from class imbalance.

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64
3    346
0    336
2    274
4    273
1    261
Name: Label_Encoding, dtype: int64


In [7]:
# Preserve the category <-> integer code mapping.
# The mapping is derived from the frame itself rather than typed by hand, so it always matches
# the codes stored in Label_Encoding even if the row order or the set of categories changes.
_category_codes = (
    train_df[["Category", "Label_Encoding"]]
    .drop_duplicates()
    .sort_values("Label_Encoding")
)
# int() converts the numpy integers to plain Python ints, as in a hand-written dict
category_labels_to_id = {
    name: int(code)
    for name, code in zip(_category_codes["Category"], _category_codes["Label_Encoding"])
}
# Inverse lookup: integer code -> category name
id_to_category = {code: name for name, code in category_labels_to_id.items()}

In [8]:
# Display the frame once more (it has not been modified since the previous display)
train_df

Unnamed: 0,ArticleId,Text,Category,Label_Encoding
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,4
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,4
1487,1590,weak dollar hits reuters revenues at media gro...,business,0
1488,1587,apple ipod family expands market apple has exp...,tech,1


In [9]:
# Count the missing values in each column of the data set
train_df.isna().sum()

ArticleId         0
Text              0
Category          0
Label_Encoding    0
dtype: int64

In [10]:
# TF-IDF configuration
# --------------------
# sublinear_tf=True  : apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
# min_df=7           : ignore every term whose document frequency is below 7
# ngram_range=(1, 3) : build features from unigrams, bigrams and trigrams
# stop_words='english': drop the built-in English stop words
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=7,
    norm='l2',
    encoding='utf-8',
    ngram_range=(1, 3),
    lowercase=True,
    stop_words='english',
)

In [11]:
# Fit the vectorizer on every article and build the dense document-term matrix
# (one row per article, one column per retained n-gram)
tfidf_feature = tfidf.fit_transform(train_df["Text"]).toarray()

In [12]:
# Serialize the fitted vectorizer with pickle to a file (no extension) in the working directory.
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open('news_classification_tfidf_vectorizer', 'wb') as output:
    pickle.dump(tfidf, output)

In [13]:
N = 5  # Number of top-scoring terms to print per category for each n-gram size
labels = train_df.Label_Encoding

# Vocabulary in the column order of tfidf_feature. It is the same for every category,
# so it is fetched once instead of on each loop iteration.
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# the older name is used only when the newer one is not available.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

# For each category, find the terms that are most strongly associated with it
for category, category_id in sorted(category_labels_to_id.items()):
    # One-vs-rest chi-squared test: this category against all the others.
    # features_chi2[0] holds the chi-squared statistic of each feature.
    features_chi2 = chi2(tfidf_feature, labels == category_id)
    indices = np.argsort(features_chi2[0])   # feature indices, ascending by statistic
    feature_names = vocabulary[indices]      # terms in increasing order of statistic
    # Split the ranked terms by n-gram size; each list keeps the ascending order,
    # so the last N entries are the N highest-scoring terms.
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(" ")) == 3]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))  # top N unigrams
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))  # top N bigrams
    print("  . Most correlated Trigrams:\n       . {}".format('\n       . '.join(trigrams[-N:])))  # top N trigrams

# 'business':
  . Most correlated unigrams:
       . economy
       . oil
       . growth
       . bank
       . shares
  . Most correlated bigrams:
       . chief executive
       . oil prices
       . stock market
       . economic growth
       . analysts said
  . Most correlated Trigrams:
       . current account deficit
       . pre tax profits
       . chief financial officer
       . high oil prices
       . securities exchange commission
# 'entertainment':
  . Most correlated unigrams:
       . awards
       . album
       . singer
       . actor
       . film
  . Most correlated bigrams:
       . film festival
       . won best
       . best film
       . los angeles
       . box office
  . Most correlated Trigrams:
       . uk singles chart
       . best supporting actress
       . best supporting actor
       . berlin film festival
       . million dollar baby
# 'politics':
  . Most correlated unigrams:
       . tories
       . party
       . blair
       . election
       .

In [14]:
# Model setup: scikit-learn helpers for the split/evaluation cells below and the classifier itself
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# A fixed seed makes the fitted forest, and therefore the classification report, reproducible
# across runs; it matches the random_state=0 already used for the train/test split.
model = RandomForestClassifier(random_state=0)

In [15]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE: tfidf was fitted on all rows before this split, so the held-out articles
# already contributed to the vocabulary and IDF weights.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_feature,
    labels,
    test_size=0.25,
    random_state=0,
)

In [16]:
# Fit the classifier on the training partition; fit() returns the estimator,
# which is why its repr is shown as the cell output
model.fit(X_train,y_train)


RandomForestClassifier()

In [17]:
# Serialize the fitted classifier with pickle to a file (no extension) in the working directory.
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open('news_classification_rf_model', 'wb') as output:
    pickle.dump(model, output)

In [18]:
# Predict label ids for both partitions.
# Only predicted_test is used by the report cell below; predicted_train is not
# referenced again in the visible part of the notebook.
predicted_train = model.predict(X_train)
predicted_test = model.predict(X_test)

In [19]:
from sklearn.metrics import classification_report

In [20]:
# Per-class precision/recall/F1 on the held-out partition.
# Rows are keyed by the integer label ids (see id_to_category for the category names).
print (classification_report(y_test,predicted_test))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        86
           1       0.97      0.91      0.94        67
           2       0.97      0.95      0.96        63
           3       0.94      0.99      0.97        84
           4       0.96      0.95      0.95        73

    accuracy                           0.96       373
   macro avg       0.96      0.95      0.96       373
weighted avg       0.96      0.96      0.96       373



## Module for prediction


In [22]:
# Sample article for a quick check of the prediction steps.
# NOTE: this value is overwritten by the input() call in the next cell.
test_article = "Iron man actor rober junior came for promotion. The film is getting lot of attention from movie lovers across the globe. Its gonna be interesting to see how this movie performs on box-office."


In [23]:
# Replace the sample article with text typed by the user.
# The prediction printed at the end therefore depends on this interactive input.
test_article = input("Enter the text of your article")


Enter the text of your articleCondition of election in Pakistan is not that great. Political parties are trying to influence decision of voters by all illegal means. An independent observer team from UN is must to make sure that fair election happens in the country and democracy is preserved.


In [24]:
# Lowercase the article text. The vectorizer is configured with lowercase=True as well,
# so this mainly affects how the text is displayed in the next cell.
test_article = test_article.lower()


In [25]:
# Wrap the single article in a one-row DataFrame whose Text column
# uses the same column name as the training frame
test_frame = pd.DataFrame({"Text":[test_article]})
print (test_frame)

                                                Text
0  condition of election in pakistan is not that ...


In [26]:
# Vectorize with the already-fitted tfidf (transform, not fit_transform, so the vocabulary
# learned from the training articles is reused) and convert to a dense array
test_feature = tfidf.transform(test_frame.Text).toarray()


In [27]:
# Predict the label id of the article with the trained random forest
prediction = model.predict(test_feature)


In [28]:
# Prints the predicted integer label id(s); id_to_category maps an id back to its category name
print (prediction)


[3]
