In this section we import the libraries that are used throughout the rest of the notebook:

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score



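Next we read each text file separately and add it to a dictionary whose keys are the texts and whose values are the labels (the name of the folder the file was found in). Because the text itself is the key, articles with identical content are stored only once. Finally we put the texts and labels into a pandas DataFrame: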

In [2]:
# Map each article text to its category label.  Because the text itself is the
# dict key, articles with identical content collapse into a single entry (the
# label seen last wins).  Later cells read `dictionary.values()` to recover the
# labels, so this name and its insertion order must be preserved.
dictionary = {}

# folder_path = "C:\\Users\\reza\\Downloads\\bbc-fulltext\\bbc"
folder_path = "C:\\Users\\reza\\Downloads\\bbcsport-fulltext\\bbcsport"

# os.listdir() returns entries in arbitrary order, so sort them to get the same
# row order (and therefore the same seeded train/test split) on every machine.
# Only sub-directories are categories; a stray file at the top level would
# otherwise make the inner os.listdir() call raise.
list_of_folderName = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
)

for label in list_of_folderName:
    file_path = os.path.join(folder_path, label)
    # Same reasoning as above: deterministic order, regular files only.
    list_of_fileName = sorted(
        entry
        for entry in os.listdir(file_path)
        if os.path.isfile(os.path.join(file_path, entry))
    )

    for fileName in list_of_fileName:
        file_name = os.path.join(file_path, fileName)

        # NOTE(review): no encoding is given, so the platform default is used
        # and the decoded text may differ between operating systems.  Confirm
        # the dataset's encoding and pass it explicitly.
        with open(file_name, 'r') as f:
            content = f.read()

        dictionary[content] = label

# Materialise the dict views as lists so both columns are plain sequences of
# equal length, taken from the dict in the same order.
df = pd.DataFrame({'body': list(dictionary.keys()), 'labels': list(dictionary.values())})
df

Unnamed: 0,body,labels
0,Claxton hunting first major medal\n\nBritish h...,athletics
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics
2,Greene sets sights on world title\n\nMaurice G...,athletics
3,IAAF launches fight against drugs\n\nThe IAAF ...,athletics
4,"Dibaba breaks 5,000m world record\n\nEthiopia'...",athletics
...,...,...
722,Agassi into second round in Dubai\n\nFourth se...,tennis
723,Mauresmo fights back to win title\n\nWorld num...,tennis
724,Federer wins title in Rotterdam\n\nWorld numbe...,tennis
725,GB players warned over security\n\nBritain's D...,tennis


The texts are converted into vectors using the bag-of-words technique. Calling `X.toarray()` shows the matrix of all vectors, where each row represents one text and each column one vocabulary term:

In [3]:
# Learn the vocabulary from the article bodies and build the bag-of-words
# document-term count matrix (one row per text, one column per term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["body"])
# The matrix exposes .shape directly; converting it to a dense array first
# would allocate the full rows-by-vocabulary array only to read two integers.
X.shape

(727, 13291)

We reduce the vectors to 2 dimensions using the `TruncatedSVD` class (latent semantic analysis, LSA), then put the resulting LSA matrix in a DataFrame:

In [4]:
# Project the bag-of-words matrix onto 2 latent components (LSA).
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# projection, and every score derived from it, reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=42)
lsa = svd.fit_transform(X)


In [5]:
# Wrap the 2-component LSA projection in a DataFrame, one named column per component.
topic_columns = ["topic1", "topic2"]
dataSet = pd.DataFrame(data=lsa, columns=topic_columns)
dataSet
                                    

Unnamed: 0,topic1,topic2
0,16.633484,3.035413
1,13.221183,2.700554
2,35.942941,3.771477
3,19.298830,-0.888590
4,14.455030,3.584162
...,...,...
722,18.303850,3.174383
723,17.013097,4.386913
724,16.001999,0.100006
725,24.253168,-2.912380


In this step, the labels are added back to the DataFrame:

In [6]:
# Attach the category labels.  They are read from the same dict that supplied
# the text column, so they line up with the rows in order.
labels_in_row_order = list(dictionary.values())
dataSet["label"] = labels_in_row_order
dataSet

Unnamed: 0,topic1,topic2,label
0,16.633484,3.035413,athletics
1,13.221183,2.700554,athletics
2,35.942941,3.771477,athletics
3,19.298830,-0.888590,athletics
4,14.455030,3.584162,athletics
...,...,...,...
722,18.303850,3.174383,tennis
723,17.013097,4.386913,tennis
724,16.001999,0.100006,tennis
725,24.253168,-2.912380,tennis


A K-nearest-neighbours classifier with K=3 and the cosine metric is trained; 70% of the data is used for training and 30% for testing:

In [7]:
# Target is the label column; features are every remaining column.
y = dataSet["label"]
x = dataSet.drop(columns="label")

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_trian, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 3-nearest-neighbours with cosine distance; fit() returns the estimator itself.
KNN = KNeighborsClassifier(n_neighbors=3, metric="cosine").fit(x_trian, y_train)
y_pred = KNN.predict(x_test)




With only 2 dimensions, just 27.85% of the predicted test labels match the actual labels:

In [8]:
# Score the predictions of the 2-dimensional model on the held-out test set.
res = accuracy_score(y_true=y_test, y_pred=y_pred)
res1 = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
# NOTE(review): with average="micro" on single-label multiclass data the F1
# score equals plain accuracy, so this line adds no information beyond `res`.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="micro")

# One print call, one metric per line, each as a percentage.
print(
    f"accuracy_score = {res*100:.4}%",
    f"balanced_accuracy_score = {res1*100:.4}%",
    f"f1_score = {f1*100:.4}%",
    sep="\n",
)

accuracy_score = 27.85%
balanced_accuracy_score = 24.4%
f1_score = 27.85%


Next we reduce the vectors to 200 dimensions instead and repeat the previous steps:

In [9]:
# Project the bag-of-words matrix onto 200 latent components (LSA).
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# projection, and every score derived from it, reproducible across runs.
svd = TruncatedSVD(n_components=200, random_state=42)
lsa = svd.fit_transform(X)


In [10]:
# Wrap the 200-component LSA projection in a DataFrame; columns keep their
# default integer names.
svd_dataSet = pd.DataFrame(data=lsa)
svd_dataSet

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,16.633484,3.035726,-3.278307,1.300609,4.194986,0.260970,-2.921647,1.409582,0.123330,-1.826213,...,-0.565322,-1.145830,-1.000962,0.576262,1.025940,-0.087988,-0.223598,0.751824,0.132151,0.509089
1,13.221183,2.700719,-4.562808,3.100286,1.689267,2.355311,0.018906,-2.161501,1.742633,-1.217348,...,-0.184961,0.009544,0.058456,0.105545,-0.529774,-0.293265,0.247075,1.028415,0.110789,0.515532
2,35.942941,3.771307,-3.952430,9.627592,0.891979,-3.222804,0.656357,4.427466,1.662279,3.126398,...,0.677792,-1.909316,1.735213,-0.363846,0.853856,0.164277,-0.221211,-0.962432,-0.015429,-1.184322
3,19.298830,-0.888912,-1.380318,0.187257,0.024188,-1.968790,-4.325434,0.139896,-1.256179,0.159599,...,0.252074,-0.058688,0.407864,0.730184,-0.179920,0.390430,-0.328701,-0.553961,-0.224227,0.457838
4,14.455030,3.584158,-1.088799,3.651269,0.189910,-2.407173,-1.272715,0.420973,-0.761103,0.004276,...,0.310280,0.197702,0.052881,-0.084799,-0.152068,0.562335,0.540977,0.715787,0.532596,-0.295633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,18.303850,3.174187,-0.973892,1.661455,1.505978,-0.357223,1.368990,0.090733,2.003946,3.789842,...,0.253276,-1.047564,-0.391455,0.349804,0.606924,-0.702551,0.162771,0.565249,0.515621,-1.252447
723,17.013097,4.386875,-3.172195,1.127452,1.580125,-1.467452,-2.559335,0.559974,4.003227,1.645545,...,0.049319,0.508278,-0.365336,0.077596,0.710369,-0.287910,0.811160,0.504168,-0.721651,0.227075
724,16.001999,0.100007,0.434999,3.050495,-0.407764,-2.309906,-0.441360,-0.381062,0.747989,0.485541,...,0.892118,0.186296,0.585002,0.237358,-0.064900,-0.033470,-0.336210,0.242458,-0.264046,-0.473364
725,24.253168,-2.912387,-5.855109,-3.342947,-3.052049,-0.107266,-2.194500,-3.313604,-1.375401,1.231296,...,-0.248604,1.113004,-0.343620,0.306250,-0.021818,0.966685,-0.257764,-0.100386,-0.354798,0.268863


In [11]:
# Attach the category labels.  They are read from the same dict that supplied
# the text column, so they line up with the rows in order.
labels_in_row_order = list(dictionary.values())
svd_dataSet["label"] = labels_in_row_order
svd_dataSet

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,label
0,16.633484,3.035726,-3.278307,1.300609,4.194986,0.260970,-2.921647,1.409582,0.123330,-1.826213,...,-1.145830,-1.000962,0.576262,1.025940,-0.087988,-0.223598,0.751824,0.132151,0.509089,athletics
1,13.221183,2.700719,-4.562808,3.100286,1.689267,2.355311,0.018906,-2.161501,1.742633,-1.217348,...,0.009544,0.058456,0.105545,-0.529774,-0.293265,0.247075,1.028415,0.110789,0.515532,athletics
2,35.942941,3.771307,-3.952430,9.627592,0.891979,-3.222804,0.656357,4.427466,1.662279,3.126398,...,-1.909316,1.735213,-0.363846,0.853856,0.164277,-0.221211,-0.962432,-0.015429,-1.184322,athletics
3,19.298830,-0.888912,-1.380318,0.187257,0.024188,-1.968790,-4.325434,0.139896,-1.256179,0.159599,...,-0.058688,0.407864,0.730184,-0.179920,0.390430,-0.328701,-0.553961,-0.224227,0.457838,athletics
4,14.455030,3.584158,-1.088799,3.651269,0.189910,-2.407173,-1.272715,0.420973,-0.761103,0.004276,...,0.197702,0.052881,-0.084799,-0.152068,0.562335,0.540977,0.715787,0.532596,-0.295633,athletics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,18.303850,3.174187,-0.973892,1.661455,1.505978,-0.357223,1.368990,0.090733,2.003946,3.789842,...,-1.047564,-0.391455,0.349804,0.606924,-0.702551,0.162771,0.565249,0.515621,-1.252447,tennis
723,17.013097,4.386875,-3.172195,1.127452,1.580125,-1.467452,-2.559335,0.559974,4.003227,1.645545,...,0.508278,-0.365336,0.077596,0.710369,-0.287910,0.811160,0.504168,-0.721651,0.227075,tennis
724,16.001999,0.100007,0.434999,3.050495,-0.407764,-2.309906,-0.441360,-0.381062,0.747989,0.485541,...,0.186296,0.585002,0.237358,-0.064900,-0.033470,-0.336210,0.242458,-0.264046,-0.473364,tennis
725,24.253168,-2.912387,-5.855109,-3.342947,-3.052049,-0.107266,-2.194500,-3.313604,-1.375401,1.231296,...,1.113004,-0.343620,0.306250,-0.021818,0.966685,-0.257764,-0.100386,-0.354798,0.268863,tennis


In [12]:
# Target is the label column; features are the 200 LSA components.
y = svd_dataSet["label"]
x = svd_dataSet.drop(columns="label")

# Same 70/30 split and seed as the 2-dimensional experiment.
x_trian, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 3-nearest-neighbours with cosine distance; fit() returns the estimator itself.
KNN = KNeighborsClassifier(n_neighbors=3, metric="cosine").fit(x_trian, y_train)
y_pred = KNN.predict(x_test)




In [13]:
# Score the predictions of the 200-dimensional model on the held-out test set.
res = accuracy_score(y_true=y_test, y_pred=y_pred)
res1 = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
# NOTE(review): with average="micro" on single-label multiclass data the F1
# score equals plain accuracy, so this line adds no information beyond `res`.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="micro")

# One print call, one metric per line, each as a percentage.
print(
    f"accuracy_score = {res*100:.4}%",
    f"balanced_accuracy_score = {res1*100:.4}%",
    f"f1_score = {f1*100:.4}%",
    sep="\n",
)


accuracy_score = 83.56%
balanced_accuracy_score = 85.08%
f1_score = 83.56%


In this section we repeat the experiment using all dimensions of the original bag-of-words vectors, without any reduction:

In [14]:
# Densify the full bag-of-words matrix and wrap it in a DataFrame
# (one column per vocabulary term, default integer column names).
dense_counts = X.toarray()
dataSet_fullDim = pd.DataFrame(dense_counts)
dataSet_fullDim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13281,13282,13283,13284,13285,13286,13287,13288,13289,13290
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Attach the category labels.  They are read from the same dict that supplied
# the text column, so they line up with the rows in order.
labels_in_row_order = list(dictionary.values())
dataSet_fullDim["label"] = labels_in_row_order
dataSet_fullDim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13282,13283,13284,13285,13286,13287,13288,13289,13290,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,athletics
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,athletics
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,athletics
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,athletics
4,0,0,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,athletics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,tennis
723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,tennis
724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,tennis
725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,tennis


In [16]:
# Target is the label column; features are the raw term counts.
y = dataSet_fullDim["label"]
x = dataSet_fullDim.drop(columns="label")

# Same 70/30 split and seed as the reduced-dimension experiments.
x_trian, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 3-nearest-neighbours with cosine distance; fit() returns the estimator itself.
KNN = KNeighborsClassifier(n_neighbors=3, metric="cosine").fit(x_trian, y_train)
y_pred = KNN.predict(x_test)



In [17]:
# Score the predictions of the full-dimensional model on the held-out test set.
res = accuracy_score(y_true=y_test, y_pred=y_pred)
res1 = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
# NOTE(review): with average="micro" on single-label multiclass data the F1
# score equals plain accuracy, so this line adds no information beyond `res`.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="micro")

# One print call, one metric per line, each as a percentage.
print(
    f"accuracy_score = {res*100:.4}%",
    f"balanced_accuracy_score = {res1*100:.4}%",
    f"f1_score = {f1*100:.4}%",
    sep="\n",
)

accuracy_score = 82.19%
balanced_accuracy_score = 80.94%
f1_score = 82.19%


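We conclude that, beyond a certain point, increasing the number of dimensions reduces classification performance (83.56% accuracy with 200 dimensions versus 82.19% with all 13,291) while increasing the computation time. This effect is related to overfitting and is often described as the curse of dimensionality.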