<a href="https://colab.research.google.com/github/NishithaThimmappa/Coursera_projects/blob/main/DataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Objective:** 

Classification problem - classify whether a given cell is malignant or benign.



*   1= Malignant (Cancerous) - Present
*   0= Benign (Not Cancerous) -Absent



**Dataset:**    
Taken from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29). The dataset has data on a total sample of 569 benign and malignant tumor cells.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw CSV from the Colab working directory into the global `data` frame.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# file has to be uploaded to /content before the notebook can run.
data=pd.read_csv('/content/data.csv', index_col=False)

FileNotFoundError: ignored

In [None]:
# Transpose the frame (columns become rows) and show the first 31 rows of the result.
data.T.head(31)


*    The column **id** gives the Unique ID number of the sample
*   The column **diagnosis** tells if it's M=Malignant or B=Benign
*   The rest of the 30 columns have information on the features of the cell nuclei computed from digitized images of a fine needle aspirate (FNA) of a breast mass.








## Data Cleaning and Wrangling

In [None]:
# Dimensions of the frame before any column is removed.
data.shape

In [None]:
# Remove the 'id' column in place.
data.drop('id',axis=1, inplace=True)

In [None]:
# Dimensions again, after dropping 'id'.
data.shape

In [None]:
# Per-column dtype and non-null counts.
data.info()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
sum(data.duplicated())

In [None]:
# How many columns there are of each dtype.
data.dtypes.value_counts()

In [None]:
# For each column, whether it contains at least one missing value.
data.isnull().any()

In [None]:
# Distinct labels present in the target column.
data.diagnosis.unique()

In [None]:
# Persist the cleaned frame. The row index is written too (no index=False),
# which is why an 'Unnamed: 0' column shows up when the file is read back.
data.to_csv('/content/clean_data.csv')

# Exploratory Data Analysis

### Basic Descriptive Statistics

In [None]:
# Reload the cleaned CSV and discard the 'Unnamed: 0' column, i.e. the row
# index that was written out alongside the data when the file was saved.
data = pd.read_csv('/content/clean_data.csv', index_col=False)
data.drop('Unnamed: 0',axis=1, inplace=True)

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

**Skewness** measures the asymmetry of a data distribution.
It is either positive (right-skewed) or negative (left-skewed).         
Values closer to zero mean less skew.


In [None]:
data.skew()

In [None]:
no_of_obs=data.groupby('diagnosis', axis=0)
pd.DataFrame(no_of_obs.size(), columns=['Number of observations'])

In [None]:
data.columns

### Data Visualizations

#### Unimodal Analysis

In [None]:
# Bar chart of the number of samples per diagnosis label. The column is passed
# by keyword: recent seaborn releases treat the first positional argument as
# `data`, not `x`, so the positional form no longer plots the label counts.
sns.countplot(x='diagnosis', data=data, label='count')

##### Separating features into smaller groups for easy analysis

1.   **_mean suffix features**

*   Histogram
*   Density Plot
*   Boxplot             


2.  **_se suffix features**
*   Histogram
*   Density Plot
*   Boxplot          


3. **_worst suffix features**
*   Histogram
*   Density Plot
*   Boxplot     



In [None]:
def extract_column(word, frame=None):
    """Return the column names that end with the given suffix.

    Args:
        word: Suffix to match, e.g. '_mean', '_se' or '_worst'.
        frame: Optional DataFrame to inspect. When omitted, the notebook's
            global ``data`` frame is used, which is the original behaviour.

    Returns:
        A list of matching column names in their original column order
        (empty when nothing matches).
    """
    source = data if frame is None else frame
    return [col for col in source.columns if col.endswith(word)]

def unimodal_analysis_hist(word):
    """Draw a 10-bin histogram for every column of `data` ending in `word`.

    The histograms are drawn on a single 20x15 inch figure; nothing is returned.
    """
    selected = extract_column(word)
    data.hist(column=selected, bins=10, figsize=(20, 15))

def unimodal_analysis_density_plot(word):
    """Draw one density plot per column of `data` ending in `word`.

    Each column gets its own subplot with independent axes. The grid is fixed
    at 4 rows x 3 columns, i.e. room for 12 subplots. Nothing is returned.
    """
    selected = extract_column(word)
    data[selected].plot.density(
        subplots=True,
        layout=(4, 3),
        sharex=False,
        sharey=False,
        fontsize=12,
        figsize=(15, 10),
    )
  
def unimodal_analysis_boxplot(word):
    """Draw one box plot per column of `data` ending in `word`.

    Each column gets its own subplot with independent axes. The grid is fixed
    at 4 rows x 3 columns, i.e. room for 12 subplots. Nothing is returned.
    """
    selected = extract_column(word)
    data[selected].plot.box(
        subplots=True,
        layout=(4, 3),
        sharex=False,
        sharey=False,
        fontsize=12,
        figsize=(15, 10),
    )


In [None]:
# Histograms of the columns whose names end in '_mean'.
unimodal_analysis_hist('_mean')

In [None]:
# Density plots of the same '_mean' columns.
unimodal_analysis_density_plot('_mean')

The perimeter, radius, area, concavity and compactness features have an exponential distribution.

In [None]:
# Box plots of the '_mean' columns.
unimodal_analysis_boxplot('_mean')

In [None]:
# Histograms of the columns whose names end in '_se'.
unimodal_analysis_hist('_se')

In [None]:
# Density plots of the '_se' columns.
unimodal_analysis_density_plot("_se")

In [None]:
# Box plots of the '_se' columns.
unimodal_analysis_boxplot('_se')

In [None]:
# Histograms of the columns whose names end in '_worst'.
unimodal_analysis_hist('_worst')

In [None]:
# Density plots of the '_worst' columns.
unimodal_analysis_density_plot('_worst')

In [None]:
# Box plots of the '_worst' columns.
unimodal_analysis_boxplot('_worst')

In [None]:
def kdeplot_feature(word):
    """Plot a filled KDE per diagnosis class for every column ending in `word`.

    One wide FacetGrid is created per matching column, coloured by the
    'diagnosis' column of the global `data` frame, with a legend attached.
    Nothing is returned.
    """
    for column in extract_column(word):
        grid = sns.FacetGrid(data, hue="diagnosis", aspect=4, margin_titles=True)
        # `fill` is the current name of kdeplot's deprecated `shade` argument.
        grid.map(sns.kdeplot, column, fill=True).add_legend()

In [None]:
# Per-class KDE plots for the '_mean' columns.
kdeplot_feature('_mean')

In [None]:
# Per-class KDE plots for the '_se' columns.
kdeplot_feature('_se')

In [None]:
# Per-class KDE plots for the '_worst' columns.
kdeplot_feature('_worst')

In [None]:
def multimodal_analysis_corr(word):
    """Draw an annotated correlation heatmap for the columns ending in `word`.

    The upper triangle (diagonal included) is masked, so each pair of columns
    is shown only once. Nothing is returned.
    """
    columns = extract_column(word)
    corr = data[columns].corr()

    # The builtin `bool` replaces the `np.bool` alias, which was removed in
    # NumPy 1.24. True cells are hidden by seaborn.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(8, 8))
    plt.title('Breast Cancer Feature Correlation')

    # `square` expects a boolean and `linewidths` is heatmap's documented
    # argument for the width of the lines between cells.
    sns.heatmap(corr, vmax=1.2, square=True, mask=mask, ax=ax, annot=True,
                fmt='.2g', linewidths=2)

In [None]:
# Correlation heatmap of the '_mean' columns.
multimodal_analysis_corr('_mean')

In [None]:
# Correlation heatmap of the '_se' columns.
multimodal_analysis_corr('_se')

In [None]:
# Correlation heatmap of the '_worst' columns.
multimodal_analysis_corr('_worst')

In [None]:
# Correlation heatmap over all numeric features at once.
f,ax = plt.subplots(figsize=(18, 18))
# 'diagnosis' holds string labels; pandas >= 2.0 raises on non-numeric columns
# in corr(), so select the numeric columns explicitly.
corr=data.select_dtypes(include='number').corr()
# Hide the upper triangle (diagonal included). The builtin `bool` replaces the
# `np.bool` alias, which was removed in NumPy 1.24.
mask=np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette( 240 , 10 , as_cmap = True )
sns.heatmap(corr,annot=True, linewidths=.5, fmt= '.1f',ax=ax, mask=mask, cmap=cmap)
plt.xticks(fontsize=11,rotation=70)
plt.show()

In [None]:
def multimodal_analysis_scatter(word):
    """Draw a seaborn pair plot of the columns ending in `word`.

    The 'diagnosis' column is appended to the selection so that the points can
    be coloured by class. Nothing is returned.
    """
    selected = [*extract_column(word), 'diagnosis']
    sns.pairplot(data=data[selected], hue='diagnosis')

In [None]:
# Pair plot of the '_mean' columns, coloured by diagnosis.
multimodal_analysis_scatter('_mean')

In [None]:
# Pair plot of the '_se' columns, coloured by diagnosis.
multimodal_analysis_scatter('_se')

In [None]:
# Pair plot of the '_worst' columns, coloured by diagnosis.
multimodal_analysis_scatter('_worst')

In [None]:
fig = plt.figure(figsize=(12,12))


def plot(a,b,k):
    """Scatter column `a` against column `b` of `data` in subplot slot `k`.

    Args:
        a: Name of the column plotted on the x axis.
        b: Name of the column plotted on the y axis.
        k: Subplot position passed straight to plt.subplot (e.g. 221).
    """
    plt.subplot(k)
    sns.scatterplot(data=data, x=data[a], y=data[b], hue="diagnosis")
    plt.title(f"{a} vs {b}", fontsize=15)


# Fill the 2x2 grid, one feature pair per slot (221 .. 224).
first_grid_pairs = [
    ('perimeter_mean', 'radius_worst'),
    ('area_mean', 'radius_worst'),
    ('texture_mean', 'texture_worst'),
    ('area_worst', 'radius_worst'),
]
for slot, (x_name, y_name) in enumerate(first_grid_pairs, start=221):
    plot(x_name, y_name, slot)

In [None]:
# Second 2x2 grid of pairwise scatter plots, coloured by diagnosis.
fig = plt.figure(figsize=(12,12))
plot('smoothness_mean','texture_mean',221)
plot('radius_mean','fractal_dimension_worst',222)
plot('texture_mean','symmetry_mean',223)
plot('texture_mean','symmetry_se',224)

In [None]:
# Third 2x2 grid of pairwise scatter plots, coloured by diagnosis.
fig = plt.figure(figsize=(12,12))
plot('area_mean','fractal_dimension_mean',221)
plot('radius_mean','fractal_dimension_mean',222)
plot('area_mean','smoothness_se',223)
plot('smoothness_se','perimeter_mean',224)

Outlier Dropping

## Machine Learning

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the target, as a plain array.
X = data.drop(columns='diagnosis').values

# Target vector: the diagnosis labels encoded as integers.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the B/M labels
# presumably become B -> 0 and M -> 1 - confirm with le.classes_.
le = LabelEncoder()
y = le.fit_transform(data['diagnosis'].values)

In [None]:
from sklearn.preprocessing import RobustScaler
# Rescale every feature column of X with a RobustScaler.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# further down, so test rows influence the scaling - consider fitting on the
# training portion only.
rs = RobustScaler()
X = rs.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA
pca = PCA()
pca= pca.fit(X)

In [None]:
#pca.get_covariance()

In [None]:
explained_variance=pca.explained_variance_ratio_
explained_variance

In [None]:
# Bar chart of the explained-variance ratio of each principal component.
with plt.style.context('dark_background'):
    # Draw on an explicit axes: DataFrame.plot opens its own figure when no
    # `ax` is given, which left the plt.figure() created here empty and
    # ignored the requested size.
    fig, ax = plt.subplots(figsize=(6, 4))
    pd.DataFrame(
        {'Explained variance ratio': pca.explained_variance_ratio_}
    ).plot.bar(ax=ax)
    ax.set_ylabel('Explained variance ratio')
    ax.set_xlabel('Principal component index')
    ax.legend(loc='best')
    fig.tight_layout()

In [None]:
# Running total of the explained-variance ratios, displayed as percentages.
cum_explained_variance= np.cumsum(explained_variance)
cum_explained_variance*100

In [None]:
# Line chart of the cumulative explained-variance ratio.
with plt.style.context('dark_background'):
    # Draw on an explicit axes: DataFrame.plot opens its own figure when no
    # `ax` is given, which left the plt.figure() created here empty and
    # ignored the requested size.
    fig, ax = plt.subplots(figsize=(6, 4))
    pd.DataFrame(
        {'Cumulative explained variance ratio': np.cumsum(pca.explained_variance_ratio_)}
    ).plot(ax=ax)
    ax.set_ylabel('Cumulative Explained variance ratio')
    ax.set_xlabel('Principal component index')
    ax.legend(loc='best')
    fig.tight_layout()

In [None]:
# Cumulative explained variance (in percent), computed once and reported for
# the first 1, 2, 3 and 10 principal components.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_*100)
for prefix, n_components in (('First', 1), ('First 2', 2), ('First 3', 3), ('First 10', 10)):
    print('Variance explained by the ' + prefix + ' principal Component=',
          cumulative_pct[n_components - 1])

In [None]:
# NOTE: this import rebinds the name `PCA` to yellowbrick's visualizer.
from yellowbrick.features import PCA
# 2-D PCA projection coloured by class. yellowbrick matches `classes` to the
# sorted label values; y was produced by a LabelEncoder on the B/M labels, so
# 0 is benign and 1 is malignant. The names were previously listed in the
# opposite order, which mislabelled the legend. Colours are swapped along with
# the names so malignant stays red.
visualizer_2=PCA(scale=True, projection=2, classes=['benign','malignant'],
                 random_state=0, colors=['blue','red'])
visualizer_2.fit_transform(X,y)
visualizer_2.show()

In [None]:
# 3-D PCA projection coloured by class (`PCA` is yellowbrick's visualizer at
# this point in the notebook). yellowbrick matches `classes` to the sorted
# label values; y was produced by a LabelEncoder on the B/M labels, so 0 is
# benign and 1 is malignant. The names were previously listed in the opposite
# order, which mislabelled the legend. Colours are swapped along with the
# names so malignant stays red.
visualizer_3=PCA(scale=True, projection=3, classes=['benign','malignant'],
                 random_state=0, colors=['blue','red'])
visualizer_3.fit_transform(X,y)
visualizer_3.show()

In [None]:
# Scree-style plot of the variance explained by each principal component.
var= pca.explained_variance_ratio_

plt.plot(var)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
# The plotted values are explained-variance ratios, not raw eigenvalues, so
# the axis and legend are labelled accordingly.
plt.ylabel('Explained variance ratio')

leg = plt.legend(['Explained variance ratio from PCA'], loc='best', borderpad=0.3,shadow=False,markerscale=0.4)
leg.get_frame().set_alpha(0.4)
leg.set_draggable(state=True)
plt.show()

In [None]:
# Re-import scikit-learn's PCA: the name was rebound to yellowbrick's
# visualizer by an earlier cell.
from sklearn.decomposition import PCA

# Fit a PCA configured with n_components=0.95 and project X onto it.
pca_95 = PCA(random_state=0, n_components=0.95).fit(X)
X_pca_95 = pca_95.transform(X)
X_pca_95.shape

In [None]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Hold out 30% of the rows as a test set, with a fixed seed for reproducibility.
# NOTE(review): this splits the scaled X, not the PCA-reduced X_pca_95 built
# earlier, and no `stratify` argument is passed - confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Shapes of the four resulting arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### SMOTE- OverSampling Method

In [None]:
#from imblearn.over_sampling import SMOTE
#oversample=SMOTE(sampling_strategy='minority', random_state=0)
#X_train,y_train=oversample.fit_resample(X_train,y_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
def accuracy(model):
    """Print the accuracy of a fitted model on the global test split.

    Args:
        model: Any object with a ``predict`` method; it is applied to the
            global ``X_test`` and scored against the global ``y_test``.

    Returns:
        None. The score is printed, not returned.
    """
    score = metrics.accuracy_score(y_test, model.predict(X_test))
    print("\nAcuuracy Of the Model: ", score, "\n\n")

In [None]:
from sklearn.metrics import classification_report


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression: fit on the training split, then report the
# per-class metrics and the overall accuracy on the test split.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
print(classification_report(y_test, y_test_pred))
accuracy(lr)

In [None]:
# NOTE(review): `random_state` is never referenced as a name in this notebook;
# this import of a private pandas helper looks accidental.
from pandas.core.common import random_state
from sklearn.neighbors import KNeighborsClassifier

# Baseline 10-nearest-neighbours classifier using the Minkowski metric with
# p=1; fit on the training split, then report metrics on the test split.
knn = KNeighborsClassifier(metric='minkowski', p=1, n_neighbors=10).fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
print(classification_report(y_test, y_test_pred))
accuracy(knn)

In [None]:
from sklearn.svm import SVC

# Baseline RBF-kernel support vector classifier; fit on the training split,
# then report metrics on the test split.
svm = SVC(random_state=0, gamma='auto', kernel='rbf').fit(X_train, y_train)
y_test_pred = svm.predict(X_test)
print(classification_report(y_test, y_test_pred))
accuracy(svm)


In [None]:
from sklearn import metrics

def confusion_matrix(y_test, model):
    """Return the confusion matrix of a fitted model on the global test split.

    NOTE(review): the original definition had no body (a SyntaxError). This
    implementation is inferred from the signature and from the sibling
    ``accuracy`` helper - confirm it matches the intended behaviour.

    Args:
        y_test: True labels for the rows of the global ``X_test``.
        model: Any object with a ``predict`` method.

    Returns:
        The result of ``sklearn.metrics.confusion_matrix`` for the true labels
        against the model's predictions on ``X_test``.
    """
    y_pred = model.predict(X_test)
    return metrics.confusion_matrix(y_test, y_pred)

# Try various Machine Learning Algorithms

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.preprocessing import scale

In [None]:
# Gaussian Naive Bayes: fit on the training split and show the test accuracy.
print ("Result By Naive Bayes Classifier\n\n")
clf_nv = GaussianNB().fit(X_train, y_train)
clf_nv.score(X_test, y_test)

In [None]:
# Sweep the inverse regularization strength C over 1..29 for logistic
# regression, recording train and test accuracy for each value.
print ("Result By Logistic Regression\n\n")
c_values = range(1, 30)
training_accuracy, test_accuracy = [], []
for i in c_values:
    clf_lr = LogisticRegression(C=i, solver='liblinear')
    clf_lr.fit(X_train, y_train)
    training_accuracy.append(clf_lr.score(X_train, y_train))
    test_accuracy.append(clf_lr.score(X_test, y_test))

# Best test score and the (1-based) C value at which it first occurs.
best_score = max(test_accuracy)
max_index = test_accuracy.index(best_score) + 1

plt.plot(c_values, training_accuracy, label="Training")
plt.plot(c_values, test_accuracy, label="Test")
print(f"\nMax Accuracy {best_score} At Inverse Regularization = {max_index}")
plt.plot(max_index, best_score, 'ro', label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Inverse Regularization')
plt.legend()
plt.show()

print(best_score)

In [None]:
# Sweep the number of neighbours over 1..99 for k-NN, recording train and
# test accuracy for each value.
training_accuracy = []
test_accuracy = []
for n_neighbors in range(1,100):
    clf_knn = KNeighborsClassifier(n_neighbors= n_neighbors)
    clf_knn.fit(X_train,y_train)
    training_accuracy.append(clf_knn.score(X_train,y_train))
    test_accuracy.append(clf_knn.score(X_test,y_test))
plt.plot(range(1,100),training_accuracy,label="Training")
plt.plot(range(1,100),test_accuracy, label = "Test")
# Position (1-based) of the first best test score, i.e. the neighbour count.
max_index = test_accuracy.index(max(test_accuracy))+1
# A space was missing before "At", which glued the score to the next word.
print("Max Accuracy "+str(max(test_accuracy))+" At Neighbors = "+str(max_index))
plt.plot(max_index,max(test_accuracy), 'ro',label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Neighbors')
plt.legend()
plt.show()
print(max(test_accuracy))

In [None]:
# Sweep the inverse regularization strength C over 1..29 for a linear SVM,
# recording train and test accuracy for each value.
print("Result By SVM")
c_values = range(1, 30)
training_accuracy, test_accuracy = [], []
for i in c_values:
    clf_svm = LinearSVC(C=i, dual=True)
    clf_svm.fit(X_train, y_train)
    training_accuracy.append(clf_svm.score(X_train, y_train))
    test_accuracy.append(clf_svm.score(X_test, y_test))

# Best test score and the (1-based) C value at which it first occurs.
best_score = max(test_accuracy)
max_index = test_accuracy.index(best_score) + 1

plt.plot(c_values, training_accuracy, label="Training")
plt.plot(c_values, test_accuracy, label="Test")
print(f"\n\nMax Accuracy {best_score} At Inverse Regularization = {max_index}")
plt.plot(max_index, best_score, 'ro', label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Inverse Regularization')
plt.legend()
plt.show()
print(best_score)

In [None]:
# Sweep max_depth over 1..9 for a decision tree, recording train and test
# accuracy for each depth.
print("Result By Decision Tree Classifier")
depths = range(1, 10)
training_accuracy, test_accuracy = [], []
for i in depths:
    clf_dt = DecisionTreeClassifier(random_state=0, max_depth=i)
    clf_dt.fit(X_train, y_train)
    training_accuracy.append(clf_dt.score(X_train, y_train))
    test_accuracy.append(clf_dt.score(X_test, y_test))

# Best test score and the (1-based) depth at which it first occurs.
best_score = max(test_accuracy)
max_index = test_accuracy.index(best_score) + 1

plt.plot(depths, training_accuracy, label="Training")
plt.plot(depths, test_accuracy, label="Test")
print(f"\n\nMax Accuracy {best_score} At PrePruning = {max_index}")
plt.plot(max_index, best_score, 'ro', label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Prepruning')
plt.legend()
plt.show()
print(best_score)

In [None]:
# Sweep the number of trees over 1..19 for a random forest, recording train
# and test accuracy for each value.
print("Result By Randomn Forest Classifier")
training_accuracy = []
test_accuracy = []
for i in range(1,20):
    # max_features='auto' was removed in scikit-learn 1.3; for classifiers it
    # meant 'sqrt', which is accepted by old and new releases alike.
    clf_rfc= RandomForestClassifier(n_estimators=i, random_state=0, max_features='sqrt',min_samples_split=2).fit(X_train,y_train)
    training_accuracy.append(clf_rfc.score(X_train,y_train))
    test_accuracy.append(clf_rfc.score(X_test,y_test))

plt.plot(range(1,20),training_accuracy,label="Training")
plt.plot(range(1,20),test_accuracy, label = "Test")
print("\n\nMax Accuracy "+str(max(test_accuracy))+" At Estimators = "+str(test_accuracy.index(max(test_accuracy))+1))
# Position (1-based) of the first best test score, i.e. the estimator count.
max_index = test_accuracy.index(max(test_accuracy))+1
plt.plot(max_index,max(test_accuracy), 'ro',label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Estimators')
plt.legend()
plt.show()
print(max(test_accuracy))

In [None]:
# Sweep max_depth over 1..9 for gradient boosting (learning rate 0.01),
# recording train and test accuracy for each depth.
print("Result By Gradient Boost")
training_accuracy = []
test_accuracy = []
for i in range(1,10):
    clf_gb = GradientBoostingClassifier( random_state=0, max_depth=i,learning_rate=0.01).fit(X_train,y_train)
    training_accuracy.append(clf_gb.score(X_train,y_train))
    test_accuracy.append(clf_gb.score(X_test,y_test))

plt.plot(range(1,10),training_accuracy,label="Training")
plt.plot(range(1,10),test_accuracy, label = "Test")
# The loop varies max_depth, not the number of estimators, so the message and
# axis label use the same "pre-pruning" wording as the decision-tree sweep.
print("\n\nMax Accuracy "+str(max(test_accuracy))+" At PrePruning = "+str(test_accuracy.index(max(test_accuracy))+1))
max_index = test_accuracy.index(max(test_accuracy))+1
plt.plot(max_index,max(test_accuracy), 'ro',label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Prepruning')
plt.legend()
plt.show()
print(max(test_accuracy))

In [None]:
# Sweep the polynomial degree over 1..9 for a poly-kernel SVC, recording
# train and test accuracy for each degree.
print("Result By Polynomila Kernel")
degrees = range(1, 10)
training_accuracy, test_accuracy = [], []
for i in degrees:
    clf_pk = svm.SVC(degree=i, kernel='poly')
    clf_pk.fit(X_train, y_train)
    training_accuracy.append(clf_pk.score(X_train, y_train))
    test_accuracy.append(clf_pk.score(X_test, y_test))

# Best test score and the (1-based) degree at which it first occurs.
best_score = max(test_accuracy)
max_index = test_accuracy.index(best_score) + 1

plt.plot(degrees, training_accuracy, label="Training")
plt.plot(degrees, test_accuracy, label="Test")
print(f"\n\nMax Accuracy {best_score} At Degree = {max_index}")
plt.plot(max_index, best_score, 'ro', label="MAX")
plt.ylabel('% accuracy')
plt.xlabel('Degree')
plt.legend()
plt.show()
print(best_score)