***Importing important Libraries***

In [None]:
# importing important libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection
from sklearn.linear_model import Ridge
from yellowbrick.regressor import ManualAlphaSelection
from sklearn.ensemble import RandomForestClassifier

***Two dataset are made named 'dataset' and 'dataset1'***

'dataset1' is used for training the model 

'dataset' is used for visualizing the data


In [None]:
# columns are given as names as given in pdf file
#two dataset file, dataset and dataset1 are made one for visualization and other one for modeling.

my_cols = ['Param1','Param1_un','Param2','Param2_un','Param3','TMass','NiMass','Explosion energy','Snela Mass','Ni Mass']
dataset = pd.read_csv("/content/dataset.csv", header  = None,usecols= range(10), names = my_cols)
dataset1 = pd.read_csv("/content/dataset.csv", header  = None,usecols= range(10), names = my_cols )

***All the observable paraemters in 'dataset' are standarized***

In [None]:
# optical depth, maximum velocity and emergent flux of gamma-rays are standarized for smooth modelling
# dataset is used for data visualization

from sklearn.preprocessing import MinMaxScaler
dataset[['Param1','Param2','Param3']] = MinMaxScaler().fit_transform(dataset[['Param1','Param2','Param3']])

***All the parameters which are standarized are plotted in a single plot***

In [None]:
# the observable variables are plotted against each other
# kde plot of seaborn is used for plotting 

import matplotlib.pyplot as plt
plt.figure(figsize=(8,5), dpi= 80)
sns.kdeplot(dataset["Param1"], shade=True, color="g", label="Optical depth", alpha=.6)
sns.kdeplot(dataset["Param2"], shade=True, color="r", label="V max", alpha=.6)
sns.kdeplot(dataset["Param3"], shade=True, color="b", label="Flux of gamma rays", alpha=.6)
plt.xlabel('Standarize value')
plt.title('Observable Parameters plotted against each other (standarize)')
plt.legend()
plt.savefig("Observable Parameters Plotted ",dpi = 300)

**Now each observable parameter is plotted seperately with their errors.**

***The data of dataset1 is used as non-standarize data is plotted.***

***Observabel parameter 'Optical Depth' is plotted.***

In [None]:
# optical depth value is plotted with adding errors 
# no standarization has been done
# kde has been used again

plt.figure(figsize=(8,5), dpi= 80)
sns.kdeplot(dataset1["Param1"], shade=True, color="g", label=" Actual Optical depth", alpha=1)
sns.kdeplot(dataset1["Param1"] + dataset1["Param1_un"], shade=True,label="with positive error", color="b", alpha=.6)
sns.kdeplot(dataset1["Param1"] - dataset1["Param1_un"], shade=True,label="with negative error" ,color="r", alpha=.6)
plt.xlabel('Actual value')
plt.title('Optical depth plotted ( not standarize)')
plt.legend()
plt.savefig("Optical depth Plotted ",dpi = 300)


***Observable parameter maximum velocity is plotted***

In [None]:
# V max value is plotted with adding errors 
# no standarization has been done
# kde has been used again

plt.figure(figsize=(8,5), dpi= 80)
sns.kdeplot(dataset1["Param2"], shade=True, color="g", label="Actual V max", alpha=1)
sns.kdeplot(dataset1["Param2"] + dataset1["Param2_un"], shade=True, color="b", label="with positive error", alpha=.6)
sns.kdeplot(dataset1["Param2"] - dataset1["Param2_un"], shade=True, color="r", label="with negative error", alpha=.6)
plt.xlabel('Actual value')
plt.title('Velocity max plotted ( not standarize)')
plt.legend()
plt.savefig("Velocity max Plotted ",dpi = 300)


***Observable parameter emergent flux of Gamma Ray is plotted***

In [None]:
# emergent flux of gamma rays is plotted  
# no standarization has been done
# kde has been used again

plt.figure(figsize=(8,5), dpi= 80)
sns.kdeplot(dataset1["Param3"], shade=True, color="c", label="Emergent flux of gamma", alpha=0.8)
plt.xlabel('Actual value')
plt.title('Emergent Flux of Gamma plotted ( not standarize)')
plt.legend()
plt.savefig("Emrergent Flux",dpi = 300)


***A coorelation heat map is made with non-standarize data (dataset1)***

In [None]:
# Correlation heatmap has been made

plt.figure(figsize=(8,5), dpi= 80)
sns.heatmap(dataset1.corr(), annot = True,square=True,cmap= 'coolwarm')
plt.title('Correlation heatmap', fontsize = 28)
plt.savefig("Correlation Heatmap",dpi = 300)
dataset1.corr().to_csv("correlation.csv",index = True, header = True)


***Feature Encoding is done with two cloumn SNela and Ni Mass Flag Distribution***

***The feature encoding is attached as label to 'dataset'***

In [None]:
# Snela Mass distribution flag  and Ni Mass distribution  flag is hot encoded.
# LabelEncoder() is used 
# the hot-encoding is attached to dataset with column name "Snela Flag Label" and "Ni Flag Label" 

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
dataset["Snela Flag label"] = labelencoder.fit_transform(dataset1["Snela Mass"])
dataset["Ni Flag label"] = labelencoder.fit_transform(dataset1["Ni Mass"])
labelencoder.transform(['N100', 'hed8', 'mwd' ,'w7dt'])

***To get more insight of the data, now we are going to seperate the data, with different classes in SNela Mass flag distribution and Ni Mass flag distribution and plot their obserable parameters, each column has 4 different classes and therefore and thier are 2 columns therefore, in total 8 plots will be made with their observable parameters all plotted in single plot.***

***For making the graph we will use standarize data of 'dataset' as different observable parameters are need to be plotted against each other.***

***To seperate the dataset we are going to use two 'for' loops one looping under another, and both the loops will loop over 'list' and 'list1' as made in the following cell***

In [None]:
# two list are initialized
# one for going over type of flag and another one for going over which mass distribution

list11 = list(labelencoder.classes_)
list12 = list(['Snela Mass','Ni Mass'])


In [None]:
# graphs are made to analyze how a type of flag ['N100', 'hed8', 'mwd' ,'w7dt'] in both of "Snela Mass Flag" or "Ni Mass Flag" varies with 
# observable parameters
# two for loops are made one for going over "Snela Mass Flag" or "Ni Mass Flag" and another one for going over type of flag ['N100', 'hed8', 'mwd' ,'w7dt']
# total 8 graphs are made

for outer_index in range(2):
  for inner_index in range(4):
    plt.figure(figsize=(8,5), dpi= 80)
    data_index = dataset[dataset[list12[outer_index]] == list11[inner_index]]
    plot = sns.kdeplot(data_index["Param1"], shade=True, color="g", label="Optical depth", alpha=.6)
    sns.kdeplot(data_index["Param2"], shade=True, color="r", label="V max", alpha=.6)
    sns.kdeplot(data_index["Param3"], shade=True, color="b", label="Flux of gamma rays", alpha=.6)
    plt.title('graph of ' + str(list12[outer_index]) + ' flag distribution of type ' + str(list11[inner_index]))
    plt.xlabel('Actual value (not standarize)')
    data_index.corr().to_csv(str(list12[outer_index]) +" "+ str(list11[inner_index] + ".csv"),index = True,header = True)
    plt.legend()
    plt.savefig(str(list12[outer_index]) +" " +  str(list11[inner_index]),dpi = 300)
    plt.show()
    #plot.get_figure().clf()


***Feature Encoding is attached to dataset1 which is going to be used for training the model***

In [None]:
# dataset1 is used for modelling process as 'dataset' standarize and we don't need standarize data for linear regression and 
# RandomForestClassifier is already, one hot encoding is attached with "Snela Flag Label" and "Ni Flag label" column

dataset1["Snela Flag label"] = labelencoder.fit_transform(dataset1["Snela Mass"])
dataset1["Ni Flag label"] = labelencoder.fit_transform(dataset1["Ni Mass"])

***Three list are made.***

***A list of all physical parameters, list of all observable parameters, and a list of all name of all model of physical parameters.***

In [None]:
# physical_parameters is list of all physical parameters
# columns is list of cloumn names of all the observable parameters 
# model is list of models which function linear_regression_compression  will make

physical_parameters = ['TMass','NiMass','Explosion energy',"Snela Flag label","Ni Flag label"]
columns = ['Param1','Param2','Param3']
model = ['model_TMass','model_Nimass','model_explosion energy','model_Snela_flag','model_Ni_flag']

***A function is made which return the efficiency of a regression model, when input of actual valeus and predicted values are fed to this function.***

***The function return MAE (Mean Absolute Error), MSE (Mean Squared Error), RMSE (Root - Mean Squared Error)***

In [None]:
# a function metric_evaluation is made to compare the regression model efficiency
# this function will be called in another function 

def metric_evaluation (y_test,pred):
  a = metrics.mean_absolute_error(y_test,pred)
  b = metrics.mean_squared_error(y_test,pred)
  c = np.sqrt(metrics.mean_squared_error(y_test,pred))
  return [a,b,c]


***A function is made which return models of all physical parameters***

***Function just need dataset, it will itself divide it into train and test dataset, for making different models, two 'for' iteration will be done one iterating over continous physical parameters and other one iterating over class based physical parameters.***

***All the models will be returned in a form of list.***

In [None]:
# linear_regression_model directly gives all the models and the efficiency of a particular model when train and test dataset are fed to this function.
def linear_regression_comparision (given_data):

# two for loops are made one for making linear regression models and another one for classification model (RandomforestClassifier is used)
# data is splitted with train_test_split in ratio of 0.2 and only selected data is given.
# selected data means that all observable parameters and the physical parameters which is needed is only given for training and testing

  for indexing in range(3):
  
    X_train, X_test, y_train, y_test = train_test_split(given_data[columns],given_data[physical_parameters[indexing]] ,test_size = 0.2,random_state = 45)
    
    # train and test data are standarize after splitting to avoid any type of baisness one dataset can put into another
    
    
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler1.fit_transform(X_test)

    print(physical_parameters[indexing])
    
    print("linear_regression")
    model[indexing] = linear_model.LinearRegression()
    model[indexing].fit(X_train,y_train)
    pred = model[indexing].predict(X_test)
    
    # metric_evaluaiton function is called to evaluate the model

    MAE, MSE , RSQUARE = metric_evaluation(y_test,pred)
    print("MAE =" , MAE,"MSE = ", MSE ,"RSQUARE = ",RSQUARE)
    print("--------")

  for index in range(2):

    X_train, X_test, y_train, y_test = train_test_split(given_data[columns],given_data[physical_parameters[index + 3]] ,test_size = 0.2,random_state = 45)
   
    # in RandomForestClassifier we have not standarization of data has been done

    print(physical_parameters[index + 3])
    print("Random Forest Classifier")
    
    # n_estimators = 500

    model[index + 3]=RandomForestClassifier(n_estimators=500)
    model[index + 3].fit(X_train,y_train)
    pred=model[index + 3].predict(X_test) 
    
    print("Accuracy:",metrics.accuracy_score(y_test, pred))
    print("----------")

  # all the models which are made for each physical parameters are returned 
  return model

    # lasso and ridge are not suggessted as these models are used when number of input variable are very large (around 12-15), here 3
    # the change in alpha value literally did not affected much when tried on this dataset


    #lasso = LassoCV(alphas=alphas)
    #visualizer = AlphaSelection(lasso)
    #visualizer.fit(X_train, y_train)
    #plt.legend()
    #visualizer.show()

    #Ridge_model = ManualAlphaSelection(Ridge(),alphas=alphas,cv=12,scoring="neg_mean_squared_error")
    #Ridge_model.fit(X_train, y_train)

***Above function is fed 'dataset1' the non - standarize data.***

In [None]:
model1,model2,model3,model4,model5 =  linear_regression_comparision(dataset1)

***The test data is converted to pandas.DataFrame***

In [None]:
data = [[3.35,0.015,0.000012],[2.54,0.013,0.00000502],[2.46,0.013,0.0000103]]
test = pd.DataFrame(data,columns = ['Param1','Param2','Param3'])
test

***A function is made which outputs a dataframe of all physical parameters predicted when dataset is fed to it, class columns are inversely transformed to get the classes of SNela and Ni Mass Flag Distribution.***

In [None]:
#physical_parameters_value directly gives the dataframe for the given test dataset

def physical_parameters_value (data):
  index_da = ['Case1','Case2','Case3']
  T_mass = model1.predict(data)
  Ni_mass = model2.predict(data)
  Explosion_energy = model3.predict(data)
  Snela_energy_flag = labelencoder.inverse_transform(model4.predict(data))
  Ni_energy_flag = labelencoder.inverse_transform(model5.predict(data))
  
  #labelencoder.inverse_transform reverse the hot-encoded effect

  dataframe = pd.DataFrame(list(zip(T_mass,Ni_mass,Explosion_energy,Snela_energy_flag,Ni_energy_flag)),index = index_da, columns = ['Total Mass','Ni Mass','Explosion Energy','Snela Mass distribution Flag','Ni Mass distribution Flag'])
  return dataframe

In [None]:
predicted_value = physical_parameters_value(test)
predicted_value

***The predicted dataframe is exported in a '.csv' format***

In [None]:
predicted_value.to_csv("predicted_value.csv", index = True, header = True)