In [41]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import nltk
#nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing #for preprocessing text data
from sklearn.feature_extraction.text import TfidfVectorizer #TfidfVectorizer (which includes pre-processing, tokenization, and filtering out stop words)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from string import punctuation

In [42]:
df = pd.read_csv("Cancer_Dataset.csv", encoding='latin1')
df.head()

Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


In [43]:
#Dropping the irrelevant column
df = df.drop('Unnamed: 0',axis=1)
# Renaming the column names
df.columns=['Class_Labels', 'Research_Paper_Text']
df

Unnamed: 0,Class_Labels,Research_Paper_Text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...
7565,Colon_Cancer,we report the case of a 24yearold man who pres...
7566,Colon_Cancer,among synchronous colorectal cancers scrcs rep...
7567,Colon_Cancer,the heterogeneity of cancer cells is generally...
7568,Colon_Cancer,"""adipogenesis is the process through which mes..."


In [44]:
#Dropping Null Values
count = df['Class_Labels'].isna().sum()
if  count > 0:
    print(f'Found {count} null values in Class_Labels column')
    #df['Class_Labels'].fillna('missing', inplace=True) # though we could do this, we will drop the rows instead - as there is no way to impute the text
    df = df.dropna(subset=['Class_Labels'])

In [45]:
#Dropping Null Values
count = df['Research_Paper_Text'].isna().sum()
if  count > 0:
    print(f'Found {count} null values in Research_Paper_Text column')
    #df['Research_Paper_Text'].fillna('missing', inplace=True) # though we could do this, we will drop the rows instead - as there is no way to impute the text
    df = df.dropna(subset=['Research_Paper_Text'])

In [46]:
df['Class_Labels'].unique()

array(['Thyroid_Cancer', 'Colon_Cancer', 'Lung_Cancer'], dtype=object)

In [47]:
#checking data imbalance
df['Class_Labels'].value_counts()

Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: Class_Labels, dtype: int64

In [48]:
# Define stopwords list
stopwords_list = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define function for text cleaning and lemmatization
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Cleaning and lemmatization
    clean_lemmatized_text = [lemmatizer.lemmatize(token.lower()) for token in tokens if (token.lower() not in punctuation) and (token.lower() not in stopwords_list) and (len(token) > 2) and token.isalpha()]
    return " ".join(clean_lemmatized_text)

# Apply text preprocessing to the 'Research_Paper_Text' column
df['Research_Paper_Text'] = df['Research_Paper_Text'].apply(preprocess_text)

In [49]:
df.head()

Unnamed: 0,Class_Labels,Research_Paper_Text
0,Thyroid_Cancer,thyroid surgery child single institution osama...
1,Thyroid_Cancer,adopted strategy used prior year based four ex...
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis muta...
3,Thyroid_Cancer,solitary plasmacytoma skull uncommon clinical ...
4,Thyroid_Cancer,study aimed investigate serum matrix metallopr...


In [50]:
X = df['Research_Paper_Text']

In [51]:
y = df['Class_Labels']

In [52]:
le = preprocessing.LabelEncoder()
le.fit(y)
classes = list(enumerate(le.classes_))
print(classes)
y = le.transform(y)
y

[(0, 'Colon_Cancer'), (1, 'Lung_Cancer'), (2, 'Thyroid_Cancer')]


array([2, 2, 2, ..., 0, 0, 0])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [54]:
X_train.shape, y_train.shape

((5299,), (5299,))

In [55]:
X_test.shape, y_test.shape

((2271,), (2271,))

In [56]:
y_train

array([2, 2, 0, ..., 1, 2, 2])

Sklearn: Text preparation
For simplicity (and focus), we will not do any text cleaning or preprocessing. We will just use the raw text as input to the model. See the text mining data prep tutorial for more details on text cleaning and preprocessing.


In [57]:
tfidf_vect = TfidfVectorizer() # see: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

X_train = tfidf_vect.fit_transform(X_train)

In [58]:
X_train.shape

(5299, 146913)

In [59]:
X_train

<5299x146913 sparse matrix of type '<class 'numpy.float64'>'
	with 4618391 stored elements in Compressed Sparse Row format>

In [60]:
print(y_train)

[2 2 0 ... 1 2 2]


In [61]:
# Perform the TfidfVectorizer transformation
# Be careful: We are using the train fit to transform the test data set. Otherwise, the test data 
# features will be very different and match the train set!!!

X_test = tfidf_vect.transform(X_test)

In [62]:
X_train.shape, X_test.shape

((5299, 146913), (2271, 146913))

In [63]:
# These data sets are "sparse matrix". We can't see them unless we convert using toarray()
np.set_printoptions(precision=3)
print(X_train.todense())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Latent Semantic Analysis (Singular Value Decomposition)

In [64]:
svd = TruncatedSVD(n_components=1000, n_iter=10) #n_components is the number of topics, which should be less than the number of features, and number of rows in the matrix

X_train_dim_reduct = svd.fit_transform(X_train)
X_test__dim_reduct = svd.transform(X_test)

In [65]:
X_train.shape, X_test.shape

((5299, 146913), (2271, 146913))

In [66]:
X_train_dim_reduct.shape, X_test__dim_reduct.shape

((5299, 1000), (2271, 1000))

In [67]:
X_train_dim_reduct

array([[ 2.550e-01, -1.925e-01, -2.679e-02, ..., -1.010e-17, -6.370e-19,
        -5.235e-19],
       [ 2.006e-01, -1.350e-01,  5.901e-02, ...,  1.711e-18, -1.891e-18,
         2.799e-19],
       [ 2.569e-01,  1.239e-01,  6.563e-03, ..., -2.301e-18,  3.029e-18,
         6.012e-19],
       ...,
       [ 1.114e-01, -9.912e-02, -5.861e-02, ..., -7.776e-19,  5.557e-19,
         3.608e-19],
       [ 1.721e-01, -1.825e-01, -5.620e-02, ..., -1.857e-18, -9.699e-19,
        -7.729e-19],
       [ 2.244e-01, -2.165e-01, -1.257e-01, ...,  1.225e-18,  3.793e-18,
         1.271e-19]])

In [68]:
df = pd.DataFrame(X_train_dim_reduct, columns=[f"svd{num:04}" for num in range(0,X_train_dim_reduct.shape[1])])
df


Unnamed: 0,svd0000,svd0001,svd0002,svd0003,svd0004,svd0005,svd0006,svd0007,svd0008,svd0009,...,svd0990,svd0991,svd0992,svd0993,svd0994,svd0995,svd0996,svd0997,svd0998,svd0999
0,0.255006,-0.192515,-0.026789,-0.052189,0.050459,-0.048633,-0.119539,0.004981,-0.127498,0.044542,...,4.348667e-18,1.159376e-18,-6.423898e-18,2.270895e-18,-2.930734e-18,5.484538e-18,-8.401085e-18,-1.010341e-17,-6.369688e-19,-5.234664e-19
1,0.200552,-0.134990,0.059012,0.057925,0.115033,-0.067425,-0.171991,0.103636,-0.086807,-0.050263,...,6.112402e-19,2.373439e-18,-4.861969e-19,-1.304219e-18,-4.936085e-19,-7.580945e-19,1.821809e-18,1.711430e-18,-1.891001e-18,2.799444e-19
2,0.256894,0.123914,0.006563,0.017838,0.034257,-0.052455,-0.031224,0.008566,-0.012628,-0.047267,...,-1.706771e-19,3.752991e-18,-2.820620e-19,6.547565e-19,3.769297e-19,-1.221422e-18,-2.630037e-19,-2.300541e-18,3.028566e-18,6.011816e-19
3,0.246648,0.056840,0.283670,-0.106647,-0.161781,0.110875,0.079654,0.109008,-0.068634,0.059235,...,1.351335e-18,3.550868e-18,-4.031877e-19,-1.422380e-18,4.411348e-18,4.220342e-19,-6.747147e-19,2.729564e-19,-1.971893e-18,-1.929541e-18
4,0.341544,-0.062512,-0.105005,-0.003511,-0.095237,-0.058888,0.044454,0.130150,0.081088,-0.035827,...,-2.038385e-18,-1.748488e-18,-1.473837e-18,4.972083e-19,-4.226694e-18,-3.981902e-18,1.897777e-18,-7.480995e-18,8.046813e-19,5.202476e-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5294,0.295400,0.065347,-0.057038,-0.069769,-0.003257,0.027635,-0.057453,0.083789,0.082771,-0.109756,...,-4.193342e-18,7.318365e-19,-3.434719e-18,1.409463e-18,3.184844e-19,-4.797595e-18,-1.036768e-18,-6.234162e-19,3.279712e-18,1.246832e-18
5295,0.130896,-0.106104,-0.007007,0.037442,0.080965,0.007979,-0.080060,0.032742,-0.024281,-0.008455,...,8.576844e-18,4.283869e-18,2.759633e-18,1.078273e-18,7.549605e-18,-1.411157e-18,6.550529e-18,1.101143e-19,-5.895349e-19,4.035265e-18
5296,0.111394,-0.099120,-0.058613,0.014150,-0.001003,-0.030919,0.022894,0.035235,-0.017918,0.067303,...,2.990026e-19,-1.660185e-19,-9.419006e-19,3.165786e-20,-3.879411e-19,-7.843525e-19,4.108110e-19,-7.775762e-19,5.556536e-19,3.608360e-19
5297,0.172087,-0.182485,-0.056198,-0.059959,0.017467,-0.084066,0.050865,-0.027330,-0.021089,0.023624,...,-6.361217e-19,1.892272e-18,-2.281907e-18,4.623106e-18,-1.922765e-18,9.656176e-20,1.492472e-18,-1.856696e-18,-9.698527e-19,-7.729176e-19
