<a href="https://colab.research.google.com/github/Swathi1309/Data_Analytics/blob/main/Predicting%20Cancer%20Rates%20using%20a%20Linear%20Regression%20Model/Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the required libraries

In [33]:
import pandas as pd
import numpy as np

import random
import re

import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linear_model
from sklearn.pipeline import make_pipeline

# Functions for data cleaning and visualization

In [2]:
# Functions for data cleaning
# To replace a given value in a column with random values in a given range
def replace_random(data, column, value, low, high):
  for i in range(len(data[column])):
    if data.loc[i, column] == value:
      data.loc[i, column] = random.randint(low, high)
# To remove a regex pattern from the data
def remove_pattern(data, column, pattern):
  for i in range(len(data[column])):
    new = re.sub(pattern, "", str(data.loc[i, column]))
    data.loc[i, column] = float(new)

# Function to plot and save a scatter plot
def plot_scatter(data, x, y, c, name):
  fig, ax = plt.subplots(figsize=(12,12))
  plot = data.plot.scatter(y=y, x=x, c=c,cmap='inferno', ax=ax);
  figure = plot.get_figure()
  figure.savefig(name, bbox_inches='tight')

# Data Cleaning

In [None]:
# Input data
data = pd.read_excel("merged_data.xlsx")
data.info()

In [4]:
# Data Cleaning
to_drop = ['Unnamed: 0', 'State', 'AreaName', 'fips_x', 'fips_y']
data.drop(columns=to_drop, inplace=True)

data.replace (to_replace="stable", value=0, inplace = True)
data.replace (to_replace="falling", value=-1, inplace=True)
data.replace (to_replace="rising", value=1, inplace=True)
data.replace (to_replace="_", value=np.nan, inplace = True)
data.replace (to_replace="__", value=np.nan, inplace = True)

data.replace (to_replace = "*", value = np.nan, inplace=True)
replace_random(data, 'Avg_Ann_Incidence', '3 or fewer', 0, 3)
remove_pattern(data, 'Incidence_Rate', "#$")

data.set_index('FIPS')

data.rename(columns = {'Hispanic':'Med_Income_Hispanic'}, inplace=True)

# Normalizing values to the total population
data['Poverty_Rate'] = data['All_Poverty']/ (data['All_With'] + data['All_Without'])
data['M_Poverty_Rate'] = data['M_Poverty']/ (data['M_With'] + data['M_Without'])
data['F_Poverty_Rate'] = data['F_Poverty']/ (data['F_With'] + data['F_Without'])
data['M_Ins_Rate'] = data['M_With']/ (data['M_With'] + data['M_Without'])
data['F_Ins_Rate'] = data['F_With']/ (data['F_With'] + data['F_Without'])
data['All_Ins_Rate'] = data['All_With']/ (data['All_With'] + data['All_Without'])
data['Total_Population'] = data['All_With'] + data['All_Without']

# Dropping redundant columns
to_drop = ['All_Poverty', 'M_Poverty', 'F_Poverty', 'M_With', 'M_Without', 'F_With', 'F_Without', 'All_With', 'All_Without', 'Avg_Ann_Incidence', 'Avg_Ann_Deaths']
data.drop(columns=to_drop, inplace=True)

In [None]:
# Visualizing missing values
# Missing data matrix
matrix = msno.matrix(data);
matrix_copy = matrix.get_figure()
matrix_copy.savefig("Missing_data_matrix", bbox_inches='tight')
# Heatmap of correlation between missing values
heatmap = msno.heatmap(data);
heatmap_copy = heatmap.get_figure()
heatmap_copy.savefig("Missing_data_heatmap", bbox_inches='tight')

In [6]:
# Dropping rows with missing incidence or mortality rates
data.dropna(subset=['Incidence_Rate', 'Mortality_Rate'], inplace=True)
# Replacing missing data with median, only for Med_Income
med_income = data['Med_Income'].median()
data['Med_Income'].fillna(value = med_income, inplace=True)
# Coverting all data to numeric types (int or float)
for column in data:
  pd.to_numeric(data[column])

# Data Visualization

In [7]:
# Downloading description of the data
description = data.describe()
description.to_excel("Data_Description.xlsx")

In [None]:
# Downloading correlation matrix
fig, ax = plt.subplots(figsize=(15,15))
corr = sb.heatmap(data.corr(), cmap="YlGnBu", annot=True, ax=ax);
corr_copy = corr.get_figure()
corr_copy.savefig('Correlation_heatmap',bbox_inches='tight')

# Plotting distribution of income by race
fig, ax = plt.subplots(figsize=(10,10))
income_plot = sb.boxplot(data=data[['Med_Income', 'Med_Income_White', 'Med_Income_Black', 'Med_Income_Nat_Am', 'Med_Income_Asian', 'Med_Income_Hispanic']])
income_copy = income_plot.get_figure()
income_copy.savefig('Income_vs_Race', bbox_inches='tight')

# Plotting poverty rates
fig, ax = plt.subplots(figsize=(10,10))
rates_plot = sb.boxplot(data=data[['Poverty_Rate', 'M_Poverty_Rate', 'F_Poverty_Rate', 'All_Ins_Rate', 'M_Ins_Rate', 'F_Ins_Rate']])
rates_copy = rates_plot.get_figure()
rates_copy.savefig('Poverty_rates', bbox_inches = 'tight')

# Plotting incidence and mortality rates
fig, ax = plt.subplots(figsize=(10,10))
rates_plot = sb.boxplot(data=data[['Incidence_Rate', 'Mortality_Rate']])
rates_copy = rates_plot.get_figure()
rates_copy.savefig('Lung_Cancer_rates', bbox_inches = 'tight')

In [None]:
# Exploring the relationship between incidence, mortality, and recent trend
plot_scatter(data, "Mortality_Rate", "Incidence_Rate", "recent_trend", "Incidence_vs_Mortality")

# Income vs Incidence plot for each race
plot_scatter(data, "Med_Income", "Incidence_Rate", "Mortality_Rate", 'Income_vs_Incidence')
plot_scatter(data, "Med_Income_White", "Incidence_Rate", "Mortality_Rate", 'White_Income_vs_Incidence')
plot_scatter(data, "Med_Income_Black", "Incidence_Rate", "Mortality_Rate", 'Black_Income_vs_Incidence')
plot_scatter(data, "Med_Income_Nat_Am", "Incidence_Rate", "Mortality_Rate", 'Nat_Am_Income_vs_Incidence')
plot_scatter(data, "Med_Income_Asian", "Incidence_Rate", "Mortality_Rate", 'Asian_Income_vs_Incidence')
plot_scatter(data, "Med_Income_Hispanic", "Incidence_Rate", "Mortality_Rate", 'Hispanic_Income_vs_Incidence')

# Relationships between poverty and incidence
plot_scatter(data, "Poverty_Rate", "Incidence_Rate", "Mortality_Rate", "Poverty_vs_Incidence")
plot_scatter(data, "M_Poverty_Rate", "Incidence_Rate", "Mortality_Rate", "Male_Poverty_vs_Incidence")
plot_scatter(data, "F_Poverty_Rate", "Incidence_Rate", "Mortality_Rate", "Female_Poverty_vs_Incidence")

# Relationship between insurance rates and incidence
# Insurance rate
plot_scatter(data, "All_Ins_Rate", "Incidence_Rate", "Mortality_Rate", "Insurance_vs_Incidence")
plot_scatter(data, "M_Ins_Rate", "Incidence_Rate", "Mortality_Rate", "Male_Insurance_vs_Incidence")
plot_scatter(data, "F_Ins_Rate", "Incidence_Rate", "Mortality_Rate", "Female_Insurance_vs_Incidence")

In [None]:
# Pairplot between uncorrelated features
plt = sb.pairplot(data[['Med_Income', 'Poverty_Rate', 'All_Ins_Rate', 'Mortality_Rate', 'Incidence_Rate']]);
plt.savefig("pairplot.png")

# Comparing various models to find the best fit

In [36]:
# Function to compare various models for a given set of input variables and target variable
def compare_models(X, Y):
  x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size =0.2, random_state=19)
  x_train_log = np.log(x_train)
  x_test_log = np.log(x_test)

  # Linear Regression
  print ("Linear Regression:")
  model = make_pipeline(StandardScaler(), LinearRegression())
  model.fit(x_train, y_train)
  print ("Score:", model.score(x_test, y_test))
  print ("\n")

  # Linear Regression with Log Transformed Values
  print ("Linear Regression (Log Transformed Values):")
  model = make_pipeline(StandardScaler(), LinearRegression())
  model.fit(x_train_log, y_train)
  print ("Score:", model.score(x_test_log, y_test))
  print ("\n")

  # Polynomial Regression
  print ("Polynomial Regression:")
  poly = PolynomialFeatures(degree=2)
  model = make_pipeline(StandardScaler(), LinearRegression())
  model.fit(poly.fit_transform(x_train_log), y_train)
  print ("Score:", model.score(poly.fit_transform(x_test_log), y_test))
  print ("\n")

  # Polynomial Regression with Log Transformed Data
  print ("Polynomial Regression (Log Transformed Values):")
  poly = PolynomialFeatures(degree=2)
  model = make_pipeline(StandardScaler(), LinearRegression())
  model.fit(poly.fit_transform(x_train_log), y_train)
  print ("Score:", model.score(poly.fit_transform(x_test_log), y_test))
  print ("\n")

  # Polynomial Regression with Lasso Regularization
  print ("Lasso Regularized Polynomial Regression:")
  poly = PolynomialFeatures(degree=2)
  model = make_pipeline(StandardScaler(), linear_model.Lasso(alpha=0.1))
  model.fit(poly.fit_transform(x_train_log), y_train)
  print ("Score:", model.score(poly.fit_transform(x_test_log), y_test))
  print ("\n")

  # Polynomial Regression with Ridge Regularization
  print ("Ridge Regularized Polynomial Regression:")
  poly = PolynomialFeatures(degree=2)
  model = make_pipeline(StandardScaler(), linear_model.Ridge(alpha=0.1))
  model.fit(poly.fit_transform(x_train_log), y_train)
  print ("Score:", model.score(poly.fit_transform(x_test_log), y_test))
  print ("\n")

In [37]:
compare_models(data[['Poverty_Rate','Med_Income','All_Ins_Rate']], data[['Incidence_Rate']])

Linear Regression:
Score: 0.21226000694992586


Linear Regression (Log Transformed Values):
Score: 0.22784843694522994


Polynomial Regression:
Score: 0.25986150179151124


Polynomial Regression (Log Transformed Values):
Score: 0.25986150179151124


Lasso Regularized Polynomial Regression:
Score: 0.22424791265126975


Ridge Regularized Polynomial Regression:
Score: 0.24551860495186484




In [38]:
compare_models(data[['Poverty_Rate','Med_Income','All_Ins_Rate']], data[['Mortality_Rate']])

Linear Regression:
Score: 0.2642834322328188


Linear Regression (Log Transformed Values):
Score: 0.28222820616264843


Polynomial Regression:
Score: 0.308068499518368


Polynomial Regression (Log Transformed Values):
Score: 0.308068499518368


Lasso Regularized Polynomial Regression:
Score: 0.27888310797161264


Ridge Regularized Polynomial Regression:
Score: 0.29506278871120695




In [39]:
compare_models(data[['Poverty_Rate','Med_Income','M_Poverty_Rate','F_Poverty_Rate','M_Ins_Rate','F_Ins_Rate','All_Ins_Rate']], data[['Incidence_Rate']])

Linear Regression:
Score: 0.21232284636821686


Linear Regression (Log Transformed Values):
Score: 0.22926459260154075


Polynomial Regression:
Score: 0.2813778778565972


Polynomial Regression (Log Transformed Values):
Score: 0.2813778778565972


Lasso Regularized Polynomial Regression:
Score: 0.23362049214526404


Ridge Regularized Polynomial Regression:
Score: 0.25600353442399215




In [40]:
compare_models(data[['Poverty_Rate','Med_Income','M_Poverty_Rate','F_Poverty_Rate','M_Ins_Rate','F_Ins_Rate','All_Ins_Rate']], data[['Mortality_Rate']])

Linear Regression:
Score: 0.27035254381033225


Linear Regression (Log Transformed Values):
Score: 0.2916595901818587


Polynomial Regression:
Score: 0.3169247098586083


Polynomial Regression (Log Transformed Values):
Score: 0.3169247098586083


Lasso Regularized Polynomial Regression:
Score: 0.28871679670098804


Ridge Regularized Polynomial Regression:
Score: 0.31133072699380937


