In [79]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.externals import joblib



import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

#### 

In [53]:
path = r'dataset.csv'

In [54]:
# reading file to pandas dataframe
df = pd.read_csv(path, encoding="ISO-8859-1")

In [55]:
x = df['news'].tolist()
y = df['type'].tolist()

In [105]:
def data_cleaning(text):
    
    # split into sentences
    words  = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    
    # remove stop words in sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    #print(words[:100])
    
	#please it comment if you don't want to use Lemmatizer
    # lemmatizing of words 
    lmtzr = WordNetLemmatizer()
    words = [lmtzr.lemmatize(word) for word in words]
    # print(lemmt[:100])

    # stemming of words
    porter = PorterStemmer()
    words = [porter.stem(word) for word in words]
    return (" ".join(str(x) for x in words))

In [106]:
for index, string in enumerate(x):
    x[index] = data_cleaning(string)

In [107]:
x[0]

'china role yuko china lent russia help russian govern renationali key yuganskneftega unit oil group yuko reveal kremlin said tuesday russian state bank veb lent rosneft help buy yugansk turn came chine bank revel came russian govern said rosneft sign oil suppli deal china deal see rosneft receiv credit china cnpc accord russian newspap vedomosti credit would use pay loan rosneft receiv financ purcha yugansk report said cnpc offer yugansk return provid financ compani opt oil suppli deal instead analyst said one factor might influenc chine deci possibl litig yuko yugansk former owner cnpc becom sharehold rosneft veb declin comment two compani rosneft cnpc agr deliveri said russian oil offici sergei oganesyan noth unusu five six year announc help explain rosneft indebt rel unknown firm abl financ surpri purcha yugansk yugansk sold auction last year help yuko pay part bill unpaid tax fine embattl russian oil giant previou file bankruptci protect US court attempt prevent forc sale main pro

In [108]:
tf_vec = TfidfVectorizer(stop_words='english', min_df=2)
X = tf_vec.fit_transform(x)
Y = np.array(y)

In [109]:
"No of features",X.shape[1]

('No of features', 11035)

In [110]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20, random_state=42)

In [111]:
model = RandomForestClassifier(n_estimators=300, max_depth=150,n_jobs=1)

In [112]:
model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [113]:
y_pred = model.predict(X_test)

In [114]:
con_mat = confusion_matrix(Y_test,y_pred)

In [115]:
kappa_coef = cohen_kappa_score(Y_test,y_pred)

In [116]:
accu = accuracy_score(Y_test,y_pred)

In [117]:
print("Confusion Matrix:\n", con_mat)
print( "\nKappa: ",kappa_coef)
print("\nAccuracy: ",accu)

Confusion Matrix:
 [[106   1   5   1   2]
 [  2  67   1   1   1]
 [  3   0  73   0   0]
 [  0   0   0 102   0]
 [  3   1   1   2  73]]

Kappa:  0.9319636884854277

Accuracy:  0.946067415730337


In [118]:
# Saving model to local for future use 
joblib.dump(model, 'news_classification_system.pkl')

joblib.dump(tf_vec, 'tf_vectorizer.pkl')

['tf_vectorizer.pkl']

#### Testing on a String of politics

In [119]:
news_content = """
Hewitt decries 'career sexism'

Plans to extend paid maternity leave beyond six months should be prominent in Labour's election manifesto, the Trade and Industry Secretary has said.

Patricia Hewitt said the cost of the proposals was being evaluated, but it was an "increasingly high priority" and a "shared goal across government". Ms Hewitt was speaking at a gender and productivity seminar organised by the Equal Opportunities Commission (EOC). Mothers can currently take up to six months' paid leave - and six unpaid. Ms Hewitt told the seminar: "Clearly, one of the things we need to do in the future is to extend the period of payment for maternity leave beyond the first six months into the second six months. "We are looking at how quickly we can do that, because obviously there are cost implications because the taxpayer reimburses the employers for the cost of that."

Ms Hewitt also announced a new drive to help women who want to work in male dominated sectors, saying sexism at work was still preventing women reaching their full potential. Plans include funding for universities to help female science and engineering graduates find jobs and "taster courses" for men and women in non-traditional jobs. Women in full-time work earn 19% less than men, according to the Equal Opportunities Commission (EOC).

The minister told delegates that getting rid of "career sexism" was vital to closing the gender pay gap.

"Career sexism limits opportunities for women of all ages and prevents them from achieving their full potential. "It is simply wrong to assume someone cannot do a job on the grounds of their sex," she said. Earlier, she told BBC Radio 4's Today programme: "What we are talking about here is the fact that about six out of 20 women work in jobs that are low-paid and typically dominated by women, so we have got very segregated employment. "Unfortunately, in some cases, this reflects very old-fashioned and stereotypical ideas about the appropriate jobs for women, or indeed for men. "Career sexism is about saying that engineering, for instance, where only 10% of employees are women, is really a male-dominated industry. Construction is even worse. "But it is also about saying childcare jobs are really there for women and not suitable for men. Career sexism goes both ways."

She added that while progress had been made, there was still a gap in pay figures. "The average woman working full-time is being paid about 80p for every pound a man is earning. For women working part-time it is 60p." The Department for Trade and Industry will also provide funding to help a new pay experts panel run by the TUC.

It has been set up to advise hundreds of companies on equal wage policies. Research conducted by the EOC last year revealed that many Britons believe the pay gap between men and women is the result of "natural differences" between the sexes. Women hold less than 10% of the top positions in FTSE 100 companies, the police, the judiciary and trade unions, according to their figures. And retired women have just over half the income of their male counterparts on average.

"""

In [125]:
news_content_clean= [] 
news_content_clean.append(data_cleaning(news_content))
news_content_clean[0]

'hewitt decri plan extend paid matern leav beyond six month promin labour elect manifesto trade industri secretari said patricia hewitt said cost propos evalu increasingli high prioriti share goal across govern Ms hewitt speak gender product seminar organis equal opportun commiss eoc mother current take six month paid leav six unpaid Ms hewitt told seminar clearli one thing need futur extend period payment matern leav beyond first six month second six month We look quickli obvious cost implic taxpay reimburs employ cost Ms hewitt also announc new drive help woman want work male domin sector say sexism work still prevent woman reach full potenti plan includ fund univers help femal scienc engin graduat find job taster cours men woman job women work earn le men accord equal opportun commiss eoc the minist told deleg get rid career sexism vital close gender pay gap career sexism limit opportun woman age prevent achiev full potenti It simpli wrong assum someon job ground sex said earlier to

In [126]:
tf_load_vec = joblib.load('tf_vectorizer.pkl')
model = joblib.load('news_classification_system.pkl')

In [127]:
extract = tf_load_vec.transform(news_content_clean)

In [128]:
model.predict(extract)

array(['politics'], dtype='<U13')