# Importing necessary libraries

In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,StackingClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn import model_selection

# Loading data

In [2]:
# Read the corpus from the local 'bbc' directory as UTF-8 text; bytes that
# cannot be decoded are replaced instead of raising an error.
# NOTE(review): presumably 'bbc' holds one sub-folder per news category - confirm.
articles = load_files('bbc', encoding='utf-8', decode_error='replace')

# X: raw document texts, y: integer category label of each document
X, y = articles.data, articles.target

# Initializing TFIDF vectorizer

In [3]:
# NLTK's English stop-word list.
# Kept as a list: scikit-learn >= 1.2 validates `stop_words` as 'english',
# a list or None, and rejects a set with an InvalidParameterError.
my_stop_words = list(stopwords.words('english'))

# norm=None leaves the tf-idf rows unnormalised; max_features=1000 restricts the
# vocabulary to 1000 terms, so every document becomes a 1000-dimensional vector.
vectorizer = TfidfVectorizer(norm=None, stop_words=my_stop_words, max_features=1000, decode_error="ignore")

# Voting Classifier

In [4]:
# Train-test split into 70:30 ratio.
# NOTE(review): random_state=74 appears to be the seed that the appendix split
# search reports as best for hard voting, so the accuracies printed by this
# cell are a best case over many splits, not an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=74)

# 1000-dimensional TF-IDF vectors: the vocabulary is learnt on the training
# split only and then re-used, unchanged, for the test split.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Initialising the different estimators.
# `multi_class` is intentionally not passed to LogisticRegression: it is
# deprecated since scikit-learn 1.5, and with the lbfgs solver a multinomial
# model is already what gets fitted on a multiclass target.
clf1 = LogisticRegression(solver='lbfgs', random_state=30, max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=1)
clf3 = MultinomialNB()

# res collects the accuracies in the order [soft voting, hard voting];
# the stacking cell appends to it and the summary cell reads it by position.
res = []
for voting in ['soft', 'hard']:
    # initialising voting classifier ('mnb' = the Multinomial Naive Bayes estimator)
    eclf1 = VotingClassifier(estimators=[('lg', clf1), ('rf', clf2), ('mnb', clf3)], voting=voting)

    # fitting data into voting classifier
    eclf1 = eclf1.fit(X_train_vectors, y_train)

    # predicting for test data
    labels1 = eclf1.predict(X_test_vectors)

    # performance metric: accuracy, computed once and re-used for print and res
    acc = accuracy_score(y_test, labels1)
    print('Voting:', voting)
    print('Accuracy :', acc, '\n\n')
    res.append(acc)

Voting: soft
Accuracy : 0.9775449101796407 


Voting: hard
Accuracy : 0.9880239520958084 




# Stacking Classifier

In [5]:
# Train-test split into 70:30 ratio.
# NOTE(review): random_state=93 was presumably picked by the appendix split
# search for the stacking classifier - confirm; if so the accuracy below is a
# best case over many splits rather than an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=93)

# 1000-dimensional TF-IDF vectors: vocabulary learnt on the training split only
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Base learners whose predictions feed the final (meta) estimator
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    # disabled experiment:
    # ('svc', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
    ('mnb', MultinomialNB()),
]

# Initialising the stacking classifier with logistic regression as meta classifier.
# `multi_class` is not passed: it is deprecated since scikit-learn 1.5 and the
# default lbfgs solver already fits a multinomial model on multiclass targets.
clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(random_state=30, max_iter=1000),
)

# Densify with .toarray() (plain ndarray). The previous .todense() returns
# np.matrix, which scikit-learn >= 1.2 rejects with a TypeError.
X_train_dense = X_train_vectors.toarray()
X_test_dense = X_test_vectors.toarray()

# fitting, predicting and calculating accuracy
accuracy = clf.fit(X_train_dense, y_train).score(X_test_dense, y_test)
print('Accuracy : ', accuracy)

# res is created by the voting cell; this becomes its third entry
res.append(accuracy)

Accuracy :  0.9835329341317365


# Summary

In [6]:
# Summary report. `res` is read by position:
# res[0] = soft voting, res[1] = hard voting, res[2] = stacking.
print('******************* Best Accuracies *******************')
print('\n => For Voting Classifier\n')

voting_rows = (
    ('\t -> for soft voting', 0),
    ('\t -> for hard voting', 1),
)
for caption, position in voting_rows:
    print(caption, res[position])

print('\n => For Stacking Classifier', res[2])


******************* Best Accuracies *******************

 => For Voting Classifier

	 -> for soft voting 0.9775449101796407
	 -> for hard voting 0.9880239520958084

 => For Stacking Classifier 0.9835329341317365


# Observations

1. Two types of Ensemble classifiers are used in this assignment i.e. Voting and Stacking.
2. Ensembling is a tricky business: several classifiers are combined so that the limitations of one classifier are offset by the others.
3. Multiple permutations were tried in this assignment to achieve the best result with different hyperparameters.
4. Some of the classifiers tried were Logistic Regression, SVM, KMeans, Multinomial Naive Bayes, Random Forest, XGBoost, etc.
5. For the voting classifier, the combination of Logistic Regression, Random Forest and Multinomial Naive Bayes showed the best result of 0.988 (hard voting).
6. For the stacking classifier, Random Forest and Multinomial Naive Bayes as base learners, with Logistic Regression as the meta-classifier, gave the best performance of 0.9835.
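7. The Appendix section contains the code used to find the best-performing train-test split for the above-mentioned combinations of classifiers. Because the split is chosen by its test accuracy, the reported figures are best-case values rather than typical ones (accuracy varies by a few percentage points across splits).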

# Appendix

## Code for Voting classifier to find the best possible split 

In [8]:
# Split search for the voting classifier: for every seed, re-split the data,
# re-fit the vectorizer and the ensemble, and record the test accuracy.
# NOTE(review): choosing the split by its test accuracy makes the "best"
# figure optimistic; it is not an unbiased estimate of generalisation.
# NOTE: this cell overwrites the notebook-level X_train/X_test/y_train/y_test,
# vectorizer, clf1..clf3 and eclf1 with the values of the last seed.

# Loop-invariant, so built once. Passed as a list because scikit-learn >= 1.2
# rejects a set for `stop_words`.
my_stop_words = list(stopwords.words('english'))

# d[voting] -> list of (random_state, accuracy) tuples
d = {'soft': [], 'hard': []}

# seeds 1..199
for x in range(1, 200):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=x)

    # fresh vectorizer per split so the vocabulary comes from this training set only
    vectorizer = TfidfVectorizer(norm=None, stop_words=my_stop_words, max_features=1000, decode_error="ignore")

    X_train_vectors = vectorizer.fit_transform(X_train)
    X_test_vectors = vectorizer.transform(X_test)

    # `multi_class` is not passed: deprecated since scikit-learn 1.5; lbfgs
    # already fits a multinomial model on multiclass targets.
    clf1 = LogisticRegression(solver='lbfgs', random_state=30, max_iter=1000)
    clf2 = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=1)
    clf3 = MultinomialNB()

    for voting in ['soft', 'hard']:
        eclf1 = VotingClassifier(estimators=[('lg', clf1), ('rf', clf2), ('mnb', clf3)], voting=voting)
        eclf1 = eclf1.fit(X_train_vectors, y_train)
        labels1 = eclf1.predict(X_test_vectors)

        # accuracy computed once and re-used for both the log line and the record
        acc = accuracy_score(y_test, labels1)
        print('Random State:', x, 'Voting:', voting, 'Accuracy :', acc)
        d[voting].append((x, acc))


print('\nBest Results')
# max() returns the first tuple with the highest accuracy, i.e. the lowest seed on ties
h = max(d['hard'], key=lambda entry: entry[1])
s = max(d['soft'], key=lambda entry: entry[1])
print('Random State:', h[0], 'Voting:hard', 'Accuracy :', h[1])
print('Random State:', s[0], 'Voting:soft', 'Accuracy :', s[1])

Random State: 1 Voting: soft Accuracy : 0.9670658682634731
Random State: 1 Voting: hard Accuracy : 0.9670658682634731
Random State: 2 Voting: soft Accuracy : 0.9745508982035929
Random State: 2 Voting: hard Accuracy : 0.9790419161676647
Random State: 3 Voting: soft Accuracy : 0.9715568862275449
Random State: 3 Voting: hard Accuracy : 0.9790419161676647
Random State: 4 Voting: soft Accuracy : 0.9700598802395209
Random State: 4 Voting: hard Accuracy : 0.9745508982035929
Random State: 5 Voting: soft Accuracy : 0.9595808383233533
Random State: 5 Voting: hard Accuracy : 0.9670658682634731
Random State: 6 Voting: soft Accuracy : 0.9565868263473054
Random State: 6 Voting: hard Accuracy : 0.9655688622754491
Random State: 7 Voting: soft Accuracy : 0.9655688622754491
Random State: 7 Voting: hard Accuracy : 0.9610778443113772
Random State: 8 Voting: soft Accuracy : 0.9760479041916168
Random State: 8 Voting: hard Accuracy : 0.9805389221556886
Random State: 9 Voting: soft Accuracy : 0.98053892215568

Random State: 69 Voting: hard Accuracy : 0.9730538922155688
Random State: 70 Voting: soft Accuracy : 0.968562874251497
Random State: 70 Voting: hard Accuracy : 0.9760479041916168
Random State: 71 Voting: soft Accuracy : 0.968562874251497
Random State: 71 Voting: hard Accuracy : 0.9745508982035929
Random State: 72 Voting: soft Accuracy : 0.9610778443113772
Random State: 72 Voting: hard Accuracy : 0.9655688622754491
Random State: 73 Voting: soft Accuracy : 0.9610778443113772
Random State: 73 Voting: hard Accuracy : 0.9700598802395209
Random State: 74 Voting: soft Accuracy : 0.9775449101796407
Random State: 74 Voting: hard Accuracy : 0.9880239520958084
Random State: 75 Voting: soft Accuracy : 0.9670658682634731
Random State: 75 Voting: hard Accuracy : 0.9700598802395209
Random State: 76 Voting: soft Accuracy : 0.9730538922155688
Random State: 76 Voting: hard Accuracy : 0.9775449101796407
Random State: 77 Voting: soft Accuracy : 0.9625748502994012
Random State: 77 Voting: hard Accuracy : 0

Random State: 137 Voting: hard Accuracy : 0.9775449101796407
Random State: 138 Voting: soft Accuracy : 0.9775449101796407
Random State: 138 Voting: hard Accuracy : 0.9775449101796407
Random State: 139 Voting: soft Accuracy : 0.9715568862275449
Random State: 139 Voting: hard Accuracy : 0.9790419161676647
Random State: 140 Voting: soft Accuracy : 0.9640718562874252
Random State: 140 Voting: hard Accuracy : 0.9670658682634731
Random State: 141 Voting: soft Accuracy : 0.9715568862275449
Random State: 141 Voting: hard Accuracy : 0.9730538922155688
Random State: 142 Voting: soft Accuracy : 0.9640718562874252
Random State: 142 Voting: hard Accuracy : 0.9640718562874252
Random State: 143 Voting: soft Accuracy : 0.9640718562874252
Random State: 143 Voting: hard Accuracy : 0.9655688622754491
Random State: 144 Voting: soft Accuracy : 0.9745508982035929
Random State: 144 Voting: hard Accuracy : 0.9775449101796407
Random State: 145 Voting: soft Accuracy : 0.9745508982035929
Random State: 145 Voting

## Code for stacking classifier to find the best possible split 

In [9]:
# Split search for the stacking classifier: for every seed, re-split the data,
# re-fit the vectorizer and the stack, and record the test accuracy.
# NOTE(review): choosing the split by its test accuracy makes the "best"
# figure optimistic; it is not an unbiased estimate of generalisation.
# NOTE: this cell overwrites the notebook-level X_train/X_test/y_train/y_test,
# vectorizer, base_learners and clf with the values of the last seed.

# Loop-invariant, so built once. Passed as a list because scikit-learn >= 1.2
# rejects a set for `stop_words`.
my_stop_words = list(stopwords.words('english'))

# list of (random_state, accuracy) tuples
d1 = []

# seeds 1..200
for x in range(1, 201):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=x)

    # fresh vectorizer per split so the vocabulary comes from this training set only
    vectorizer = TfidfVectorizer(norm=None, stop_words=my_stop_words, max_features=1000, decode_error="ignore")

    X_train_vectors = vectorizer.fit_transform(X_train)
    X_test_vectors = vectorizer.transform(X_test)

    base_learners = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # disabled experiment:
        # ('svc', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
        ('mnb', MultinomialNB()),
    ]

    # `multi_class` is not passed: deprecated since scikit-learn 1.5; the default
    # lbfgs solver already fits a multinomial model on multiclass targets.
    clf = StackingClassifier(
        estimators=base_learners,
        final_estimator=LogisticRegression(random_state=30, max_iter=1000),
    )

    # .toarray() gives a plain ndarray; .todense() would give np.matrix, which
    # scikit-learn >= 1.2 rejects with a TypeError.
    a = clf.fit(X_train_vectors.toarray(), y_train).score(X_test_vectors.toarray(), y_test)
    d1.append((x, a))
    print('Random State:', x, 'Accuracy :', a)

print('\nBest Results')
# max() returns the first tuple with the highest accuracy, i.e. the lowest seed on ties
mx = max(d1, key=lambda entry: entry[1])
print('Random State:', mx[0], 'Accuracy :', mx[1])

Random State: 1 Accuracy : 0.9640718562874252
Random State: 2 Accuracy : 0.9670658682634731
Random State: 3 Accuracy : 0.968562874251497
Random State: 4 Accuracy : 0.9595808383233533
Random State: 5 Accuracy : 0.9595808383233533
Random State: 6 Accuracy : 0.9505988023952096
Random State: 7 Accuracy : 0.9520958083832335
Random State: 8 Accuracy : 0.968562874251497
Random State: 9 Accuracy : 0.9745508982035929
Random State: 10 Accuracy : 0.9610778443113772
Random State: 11 Accuracy : 0.9640718562874252
Random State: 12 Accuracy : 0.9715568862275449
Random State: 13 Accuracy : 0.9595808383233533
Random State: 14 Accuracy : 0.9640718562874252
Random State: 15 Accuracy : 0.9670658682634731
Random State: 16 Accuracy : 0.9640718562874252
Random State: 17 Accuracy : 0.9565868263473054
Random State: 18 Accuracy : 0.9715568862275449
Random State: 19 Accuracy : 0.9595808383233533
Random State: 20 Accuracy : 0.9625748502994012
Random State: 21 Accuracy : 0.9760479041916168
Random State: 22 Accurac

Random State: 175 Accuracy : 0.968562874251497
Random State: 176 Accuracy : 0.9610778443113772
Random State: 177 Accuracy : 0.968562874251497
Random State: 178 Accuracy : 0.9700598802395209
Random State: 179 Accuracy : 0.9595808383233533
Random State: 180 Accuracy : 0.9595808383233533
Random State: 181 Accuracy : 0.9700598802395209
Random State: 182 Accuracy : 0.9760479041916168
Random State: 183 Accuracy : 0.968562874251497
Random State: 184 Accuracy : 0.9670658682634731
Random State: 185 Accuracy : 0.9610778443113772
Random State: 186 Accuracy : 0.9625748502994012
Random State: 187 Accuracy : 0.9700598802395209
Random State: 188 Accuracy : 0.968562874251497
Random State: 189 Accuracy : 0.9640718562874252
Random State: 190 Accuracy : 0.9700598802395209
Random State: 191 Accuracy : 0.9655688622754491
Random State: 192 Accuracy : 0.9745508982035929
Random State: 193 Accuracy : 0.9610778443113772
Random State: 194 Accuracy : 0.9595808383233533
Random State: 195 Accuracy : 0.9655688622754