# Objective : Regression-NLP

# Exploratory Data Analysis (EDA) - Python

# Insights - Patterns

# Regression (Using the ML)

# 1. Load Python Modules

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from tabulate import tabulate

from sklearn.linear_model import LinearRegression,Lasso, Ridge,ElasticNet

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

#from sklearn.metrics import mean_squared_error,root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srishanm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srishanm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Read the Dataset from CSV file  - Using Pandas

In [23]:
file_path=r"train.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,breast_cancer_diagnosis_desc,metastatic_diagnosis_period
0,Malignant neoplasm of unspecified site of left...,191
1,Malig neoplasm of upper-outer quadrant of left...,33
2,"Malignant neoplasm of breast (female), unspeci...",157
3,Malignant neoplasm of unsp site of right femal...,146
4,Malignant neoplasm of other specified sites of...,286


# 3. Basic Inspection on given dataset

In [24]:
def basic_inspection_text_dataset(table):
    print("Top 5 Records of dataset")
    print(table.head())
    print()
        
    print("Bottom Records of dataset")
    print(table.tail())
    print()
    
    print("Column/features/Variable  - Names of Given dataset")
    print(table.columns)
    print()
    
    print("Shape(rows x columns) - of Given dataset")
    print(table.shape)
    print()
    
    print("Data types - Given Column Names")
    print(table.dtypes)
    print()
    
    print("Summry of dataset")
    print(table.info())
    print()
    
    print("To see the count of null/nan values in columns of dataset")
    print(table.isnull().value_counts())
    print()
    
    print("Dataset Summary ")
    print(table.describe())
    print()
    
basic_inspection_text_dataset(df)

Top 5 Records of dataset
                        breast_cancer_diagnosis_desc  \
0  Malignant neoplasm of unspecified site of left...   
1  Malig neoplasm of upper-outer quadrant of left...   
2  Malignant neoplasm of breast (female), unspeci...   
3  Malignant neoplasm of unsp site of right femal...   
4  Malignant neoplasm of other specified sites of...   

   metastatic_diagnosis_period  
0                          191  
1                           33  
2                          157  
3                          146  
4                          286  

Bottom Records of dataset
                            breast_cancer_diagnosis_desc  \
13168  Malignant neoplasm of unsp site of right femal...   
13169  Malignant neoplasm of breast (female), unspeci...   
13170  Malignant neoplasm of ovrlp sites of unsp fema...   
13171  Malignant neoplasm of upper-inner quadrant of ...   
13172  Malignant neoplasm of unsp site of right femal...   

       metastatic_diagnosis_period  
13168          

# 4. Dataset - Understanding

In [25]:
df.shape

(13173, 2)

# 5. Create bag of words representation using CountVectorizer

In [26]:
def pre_processing(text):
    words =  word_tokenize(text) 
    eng_stopwords=stopwords.words("english")
    tokens = [word for word in words if word.isalnum()]
    lower_tokens = [word.lower() for word in tokens ]

    no_punctuations_stopwords_tokens = [token for  token in lower_tokens if token not in eng_stopwords]
    return " ".join(no_punctuations_stopwords_tokens)

In [27]:
v = CountVectorizer()
df["processed_Message"]=df["breast_cancer_diagnosis_desc"].apply(pre_processing)
X_Message_cv = v.fit_transform(df["processed_Message"])

# 6. Train test split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_Message_cv, df['metastatic_diagnosis_period'] , test_size=0.2,random_state=42)

In [29]:
X_train.shape

(10538, 24)

In [30]:
X_test.shape

(2635, 24)

In [31]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [32]:
type(y_train)

pandas.core.series.Series

In [33]:
y_train[:4]

5363    212
7974     22
8019    109
9975     34
Name: metastatic_diagnosis_period, dtype: int64

In [34]:
v.get_feature_names_out()

array(['areola', 'axillary', 'breast', 'central', 'female', 'left',
       'male', 'malig', 'malignant', 'neoplasm', 'neoplm', 'nipple',
       'overlapping', 'ovrlp', 'portion', 'quadrant', 'right',
       'secondary', 'site', 'sites', 'specified', 'tail', 'unsp',
       'unspecified'], dtype=object)

In [35]:
v.vocabulary_

{'malignant': 8,
 'neoplasm': 9,
 'unspecified': 23,
 'site': 18,
 'left': 5,
 'female': 4,
 'breast': 2,
 'malig': 7,
 'quadrant': 15,
 'unsp': 22,
 'right': 16,
 'specified': 20,
 'sites': 19,
 'neoplm': 10,
 'ovrlp': 13,
 'nipple': 11,
 'areola': 0,
 'central': 3,
 'portion': 14,
 'secondary': 17,
 'axillary': 1,
 'tail': 21,
 'male': 6,
 'overlapping': 12}

In [36]:
X_train_np = X_train.toarray()
X_train_np[0]

array([0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0], dtype=int64)

In [37]:
np.where(X_train_np[0]!=0)

(array([ 1,  2,  4,  5,  8,  9, 21], dtype=int64),)

## 7. Train  model

In [38]:
def adjusted_r_squared(y_true, y_pred, n_samples, n_features):
    """
    Calculate the adjusted R-squared score.

    Parameters:
    - y_true: array-like, true target values
    - y_pred: array-like, predicted target values
    - n_samples: int, number of samples (observations)
    - n_features: int, number of features (predictors)

    Returns:
    - adjusted R-squared score
    """
    from sklearn.metrics import r2_score

    r_squared = r2_score(y_true, y_pred)
    adjusted_r_squared = 1 - (1 - r_squared) * ((n_samples - 1) / (n_samples - n_features - 1))
    
    return adjusted_r_squared


In [39]:
model_results = {}
def regression_matrix(model ,X_train,X_test,y_test, model_name):
    print("Model Name ",model_name)
    y_pred = model.predict(X_test)
    train_r2_score=round(model.score(X_train,y_train),3)
    print("train R2 Score:",train_r2_score)
    test_r2_score=round(model.score(X_test,y_test),3)
    print("Test R2 Score:",test_r2_score)
    print("Test R2 score:",r2_score(y_test,y_pred))
    
    mse = round(mean_squared_error(y_test,y_pred),3)
    print("MSE:",mse)
    #rmse=round(root_mean_squared_error(y_test,y_pred),3)
    rmse=np.sqrt(mse)
    print("RMSE:",rmse)
    adj_r2_score=round(adjusted_r_squared(y_test,y_pred,len(y_test),X_train.shape[1]),3)
    print("Adj-R Score",adj_r2_score)
    
    if abs(train_r2_score - test_r2_score) > .10:
        print("model :" , model_name ,"is overfitting")
    if train_r2_score < 0.50:
        print("model :" , model_name ,"is underfitting")
    
    model_results[model_name]=[train_r2_score,test_r2_score,adj_r2_score,mse,rmse]
    

 ## Decision Tree Regression

In [40]:
# Create and fit the model
model = DecisionTreeRegressor()
model.fit(X_train,y_train)
print("Model - Decision Tree Regression")

regression_matrix(model ,X_train,X_test,y_test, "DT")

Model - Decision Tree Regression
Model Name  DT
train R2 Score: 0.41
Test R2 Score: 0.438
Test R2 score: 0.4384014062635796
MSE: 6821.237
RMSE: 82.59078035713188
Adj-R Score 0.433
model : DT is underfitting


## Random Forest Regression

In [41]:
# Create and fit the model
model = RandomForestRegressor()
model.fit(X_train, y_train)
print("Model - Random Forest Regression")

regression_matrix(model ,X_train,X_test,y_test, "RandomForest")

Model - Random Forest Regression
Model Name  RandomForest
train R2 Score: 0.41
Test R2 Score: 0.439
Test R2 score: 0.43856305327629974
MSE: 6819.273
RMSE: 82.57888955417117
Adj-R Score 0.433
model : RandomForest is underfitting


## AdaBoost Regression

In [42]:
# Create and fit the model
ada_boost = AdaBoostRegressor()
ada_boost.fit(X_train, y_train)
print("Model - AdaBoost Regression ")

regression_matrix(ada_boost ,X_train,X_test,y_test, "AdaBoost")

Model - AdaBoost Regression 
Model Name  AdaBoost
train R2 Score: 0.364
Test R2 Score: 0.385
Test R2 score: 0.38528457231137303
MSE: 7466.399
RMSE: 86.40832714501538
Adj-R Score 0.38
model : AdaBoost is underfitting


## Gradient Boosting Regression

In [43]:
# Create and fit the model
gradient_boost = GradientBoostingRegressor()
gradient_boost.fit(X_train, y_train)
print("Model - Gradient Boosting Regression")

regression_matrix(gradient_boost ,X_train,X_test,y_test, "GradientBoost")

Model - Gradient Boosting Regression
Model Name  GradientBoost
train R2 Score: 0.408
Test R2 Score: 0.436
Test R2 score: 0.4362359792584193
MSE: 6847.538
RMSE: 82.7498519636138
Adj-R Score 0.431
model : GradientBoost is underfitting


##  XGBoost Regression

In [44]:
# Create and fit the model
xg_boost = xgb.XGBRegressor()
xg_boost.fit(X_train, y_train)
print("Model-XGBoost Regression")

regression_matrix(xg_boost ,X_train,X_test,y_test, "XGB")

Model-XGBoost Regression
Model Name  XGB
train R2 Score: 0.41
Test R2 Score: 0.438
Test R2 score: 0.43840132964472756
MSE: 6821.238
RMSE: 82.59078641107615
Adj-R Score 0.433
model : XGB is underfitting


##  Summary

In [45]:
print("\n\n")
result=pd.DataFrame(model_results,index=["Train R2","Test R2" ,"Adj R2","MSE","RMSE"])
print(result)
print("\n\n")

print(tabulate(result, headers='keys', tablefmt='pretty'))




                  DT  RandomForest     AdaBoost  GradientBoost          XGB
Train R2     0.41000       0.41000     0.364000       0.408000     0.410000
Test R2      0.43800       0.43900     0.385000       0.436000     0.438000
Adj R2       0.43300       0.43300     0.380000       0.431000     0.433000
MSE       6821.23700    6819.27300  7466.399000    6847.538000  6821.238000
RMSE        82.59078      82.57889    86.408327      82.749852    82.590786



+----------+-------------------+-------------------+-------------------+------------------+-------------------+
|          |        DT         |   RandomForest    |     AdaBoost      |  GradientBoost   |        XGB        |
+----------+-------------------+-------------------+-------------------+------------------+-------------------+
| Train R2 |       0.41        |       0.41        |       0.364       |      0.408       |       0.41        |
| Test R2  |       0.438       |       0.439       |       0.385       |      0.436       