# Building Pipeline

## Importing Libraries

In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.impute import SimpleImputer

## Suppressing All Warnings

In [2]:
# Suppress all warnings: install a blanket "ignore" filter so that warnings
# raised by this code and by the imported libraries are not displayed.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn -- consider narrowing the filter by category or module.
warnings.filterwarnings("ignore")

## Pipeline Functions (Each Function Performs a Specific Task)



In [4]:
# Function to import the dataset
def imp(file_path):
    """Read the CSV located at ``file_path`` and return it as a DataFrame.

    ``file_path`` is handed to ``pd.read_csv`` unchanged, so every parsing
    option is left at the pandas default.
    """
    return pd.read_csv(file_path)

# Function to check for missing values
def check_null(dataframe):
    """Report which columns of ``dataframe`` contain missing values.

    Prints the number of affected columns and, when there is at least one,
    the list of their names. Nothing is returned and ``dataframe`` is not
    modified.
    """
    null_counts = dataframe.isnull().sum()
    # Keep only the column labels whose null count is positive.
    columns_with_nulls = null_counts.index[null_counts > 0].tolist()
    print(f'\nColumns with missing values: {len(columns_with_nulls)}')
    if columns_with_nulls:
        print(f'Missing values found in columns: {columns_with_nulls}')

# Updated function to select features and handle missing values
def extract_and_display_features(file_path):
    """Load a CSV and return its selected gene columns with gaps mean-imputed.

    The CSV at ``file_path`` is read with pandas, restricted to the fixed
    list of gene columns below, and passed through a ``SimpleImputer`` using
    the ``'mean'`` strategy. The imputer is fitted on this same data. The
    imputed values are wrapped in a new DataFrame labelled with the selected
    column names, its first rows are printed, and the DataFrame is returned.
    """
    # Gene columns fed to the model, in the order they are selected.
    gene_columns = [
        'gene_7705', 'gene_6199', 'gene_6763', 'gene_6963', 'gene_6707', 'gene_6721',
        'gene_7583', 'gene_7619', 'gene_7913', 'gene_7850', 'gene_7071', 'gene_5788',
        'gene_7764', 'gene_6185', 'gene_7090', 'gene_7394', 'gene_4593', 'gene_7774',
        'gene_7755', 'gene_7224', 'gene_6922', 'gene_7954', 'gene_6271', 'gene_4867',
        'gene_7813', 'gene_7976', 'gene_5809', 'gene_6719', 'gene_6988', 'gene_7428',
        'gene_7773', 'gene_7922', 'gene_7651', 'gene_5603', 'gene_4079', 'gene_7502',
        'gene_5544', 'gene_7787', 'gene_7236', 'gene_7953', 'gene_6882', 'gene_7565',
        'gene_7703', 'gene_7685', 'gene_7333', 'gene_6134', 'gene_7896', 'gene_7277',
        'gene_7043', 'gene_6445', 'gene_6649', 'gene_6747', 'gene_7307', 'gene_7415',
        'gene_6403', 'gene_7614', 'gene_7205', 'gene_7766', 'gene_7284', 'gene_7760',
        'gene_7821', 'gene_5812', 'gene_7898', 'gene_7944', 'gene_7854', 'gene_4338',
        'gene_7594', 'gene_7048', 'gene_7215', 'gene_7785', 'gene_7864', 'gene_7733',
        'gene_4937', 'gene_6652', 'gene_7359', 'gene_7422', 'gene_7259', 'gene_6408',
        'gene_7273', 'gene_6289', 'gene_7297', 'gene_7178', 'gene_7659', 'gene_6182',
        'gene_4874', 'gene_7476', 'gene_7931', 'gene_7509', 'gene_6377', 'gene_5623',
        'gene_6827', 'gene_7570', 'gene_7990', 'gene_7212', 'gene_7294', 'gene_7416',
        'gene_7634', 'gene_7335', 'gene_7838', 'gene_6107', 'gene_7219', 'gene_5549',
        'gene_7554', 'gene_6256', 'gene_5208', 'gene_7504', 'gene_7871', 'gene_7788',
        'gene_6421', 'gene_7989', 'gene_7805', 'gene_7835', 'gene_6131', 'gene_7376',
        'gene_7832', 'gene_7032', 'gene_7361', 'gene_6575', 'gene_4570', 'gene_6904',
        'gene_6916', 'gene_7720', 'gene_7231', 'gene_7725', 'gene_5861', 'gene_5853',
        'gene_7563', 'gene_7985',
    ]

    raw = pd.read_csv(file_path)

    # Fill missing entries in each selected column with that column's mean.
    mean_imputer = SimpleImputer(strategy='mean')
    imputed_values = mean_imputer.fit_transform(raw[gene_columns])

    # The imputer returns a bare array; restore the column labels.
    X_df = pd.DataFrame(imputed_values, columns=gene_columns)

    print("Extracted Features:")
    print(X_df.head())  # Display the first few rows
    return X_df

# Function to load the trained model
def load_model(dataframe, model_path='/content/Updated_Model_For_Cancer_Prediction'):
    """Load the pickled model and strip the label column from ``dataframe``.

    Args:
        dataframe: DataFrame that includes a ``'Cancer_Type'`` column.
        model_path: Location of the pickled model. Defaults to the path the
            pipeline has always used, so existing callers are unaffected.

    Returns:
        A ``(model_loaded, df)`` tuple: the unpickled object and a copy of
        ``dataframe`` without the ``'Cancer_Type'`` column.

    Raises:
        OSError: If ``model_path`` cannot be opened.
        KeyError: If ``dataframe`` has no ``'Cancer_Type'`` column.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    # The context manager guarantees the file handle is closed even if
    # unpickling fails (the bare open() previously leaked it).
    with open(model_path, 'rb') as model_file:
        model_loaded = pickle.load(model_file)
    df = dataframe.drop(columns=['Cancer_Type'])
    print('\nModel is loaded')
    return model_loaded, df

# Function to make predictions
def pred(model_loaded, df):
    """Announce and return the predictions of ``model_loaded`` for ``df``.

    A header line is printed first; the result of ``model_loaded.predict(df)``
    is then returned unchanged.
    """
    print('\nModel predictions are: \n')
    return model_loaded.predict(df)

# Function to convert predictions to a CSV file
def convert_to_csv(prediction, dataframe, output_path='/content/Predicted_File.csv'):
    """Attach predictions to ``dataframe``, display them and save to CSV.

    Args:
        prediction: Predicted labels, one per row of ``dataframe``.
        dataframe: DataFrame containing a ``'Cancer_Type'`` column. It is
            modified in place: a ``'Predicted_Cancer_Type'`` column is added.
        output_path: Destination CSV file. Defaults to the location the
            pipeline has always written to, so existing callers are unaffected.

    Returns:
        The same ``dataframe`` object, now carrying the prediction column.

    Raises:
        KeyError: If ``dataframe`` has no ``'Cancer_Type'`` column.
        OSError: If ``output_path`` cannot be written.
    """
    dataframe['Predicted_Cancer_Type'] = prediction

    # Show the original labels next to the predicted ones.
    print("Predictions have been added to the DataFrame:")
    print(dataframe[['Cancer_Type', 'Predicted_Cancer_Type']])

    dataframe.to_csv(output_path, index=False)

    # Build the confirmation from the real destination; for the default path
    # this yields exactly the message the function printed before.
    directory, filename = os.path.split(output_path)
    directory = directory or '.'
    print(f'\nThe entire DataFrame with predictions is saved as a CSV file in {directory}/ with the name "{filename}"')
    return dataframe

## Main Function (Pipeline)

In [5]:
# Main function to run the pipeline
def main(file_path):
    """Run the prediction pipeline end to end on the CSV at ``file_path``.

    Steps: load and preview the data, report missing values, build the
    imputed feature table, attach the ``'Cancer_Type'`` labels, load the
    model, predict, and save the labelled predictions to CSV. Progress is
    printed at each step; nothing is returned.
    """
    # Load the data
    raw_data = imp(file_path)
    print(raw_data.head())

    # Check for missing values
    check_null(raw_data)

    # Select specified features
    dataframe = extract_and_display_features(file_path)
    # Take the labels from the frame loaded above instead of parsing the
    # whole CSV a third time just to read one column.
    dataframe['Cancer_Type'] = raw_data['Cancer_Type']
    print(dataframe.head())

    # Load model and prepare data (label column removed from the features)
    model_loaded, df = load_model(dataframe)
    print(model_loaded)
    print('\n')

    # Make predictions
    prediction = pred(model_loaded, df)
    print(prediction)

    # Convert predictions to CSV and display DataFrame
    predictions_df = convert_to_csv(prediction, dataframe)
    print(predictions_df.head())

## Passing Data

In [6]:
# Input CSV for the pipeline; the run is skipped with a message when the file
# is not present.
file_path = '/content/cancer_gene_expression2.csv'
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    main(file_path)

   gene_1  gene_2  gene_3    gene_4  gene_5  gene_6  gene_7  gene_8    gene_9  \
0     0.0     0.0     0.0  2.088413     0.0     0.0     0.0     0.0  0.550605   
1     0.0     0.0     0.0  3.205955     0.0     0.0     0.0     0.0  0.425244   
2     0.0     0.0     0.0  4.746646     0.0     0.0     0.0     0.0  2.639417   
3     0.0     0.0     0.0  1.173191     0.0     0.0     0.0     0.0  1.527371   
4     0.0     0.0     0.0  1.366532     0.0     0.0     0.0     0.0  0.000000   

    gene_10  ...  gene_7992  gene_7993  gene_7994  gene_7995  gene_7996  \
0  2.815760  ...  11.558803   8.881802   6.014840   6.643534  11.740624   
1  2.354396  ...  11.062829   9.032864   5.054193   6.432320  12.104985   
2  1.657091  ...  12.497640   7.198160   0.943434   7.371690  11.202356   
3  2.732899  ...  11.261713   8.725676   6.300418   6.036451  11.732303   
4  3.388355  ...  12.241965   7.685204   5.142948   6.355788  11.493950   

   gene_7997  gene_7998  gene_7999  gene_8000  Cancer_Type  
0