<a href="https://colab.research.google.com/github/PranjalMinocha/JPLasteroid/blob/main/Week%202/Aryan_Shukla_week_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting drive for dataset

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be
# read by path from this notebook (requires the `google.colab` runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Read CSV file

In [42]:
# Load the raw asteroid table from the mounted Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning on load.
df = pd.read_csv("/content/drive/MyDrive/input/Asteroid_Updated.csv", low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


### Taking Sample from dataframe

In [43]:
# Show five random rows; the fixed seed keeps the preview identical across
# Restart & Run All so the prose can refer to what is displayed.
df.sample(5, random_state=42)

Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,...,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
21877,,2.99685,0.043435,10.022098,209.305659,287.356954,2.866683,3.127018,5.18807,14562.0,...,,,,,,1.89841,MBA,0.189979,1894.942492,153.542479
751067,,3.162726,0.082937,13.55961,156.59545,316.09959,2.900417,3.425034,5.624714,,...,,,,,,,MBA,0.175231,2054.426788,354.6347
297810,,2.398774,0.137652,1.580879,315.125508,234.586699,2.068577,2.728971,3.715286,6061.0,...,,,,,,1.06986,MBA,0.265289,1357.008281,171.012934
825252,,3.147952,0.257985,9.853674,1.756337,53.47353,2.335826,3.960077,5.585348,4519.0,...,,,,,,1.36195,MBA,0.176466,2040.048509,71.509756
64853,,2.245662,0.101523,3.420135,266.808807,7.640945,2.017674,2.473649,3.365307,21290.0,...,,,,,,1.00089,MBA,0.292879,1229.178417,222.262688


### Details of Some Columns
- name- Name of asteroids
- a - Semi major axis
- e - eccentricity
- i - inclination with respect to the x-y ecliptic plane
- om - longitude of the ascending node
- w - argument of perihelion
- q - perihelion distance
- ad - aphelion distance
- per_y - orbital period (years)
- data_arc - data arc span (d)
- condition_code - orbit condition code
- n_obs_used - number of observations used
- H - absolute magnitude parameter
- neo - Near-Earth Object
- pha - Potentially Hazardous Asteroid
- diameter - diameter of asteroids
- extent - Object bi/tri axial ellipsoid dimensions(Km)
- albedo - geometric albedo
- rot_per - rotation period

In [48]:
# Report the column names of the raw frame and how many there are.
print("variables in dataset are", df.columns.tolist())
print("No. of variables are", len(df.columns))

variables in dataset are ['name', 'a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'condition_code', 'n_obs_used', 'H', 'neo', 'pha', 'diameter', 'extent', 'albedo', 'rot_per', 'GM', 'BV', 'UB', 'IR', 'spec_B', 'spec_T', 'G', 'moid', 'class', 'n', 'per', 'ma']
No. of variables are 31


### There are entries with null diameters

In [45]:
# Example: the first few rows whose 'diameter' value is missing.
df.loc[df['diameter'].isna()].head()

Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,...,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
681,Hagar,2.65404,0.171983,11.505647,190.799959,104.993824,2.197591,3.110489,4.323837,40313.0,...,,,,,,1.21545,MBA,0.227952,1579.28137,134.457728
698,Hela,2.610998,0.410284,15.29918,242.551766,91.399514,1.539746,3.682249,4.219081,42540.0,...,0.386,,Sq,S,,0.624487,MCA,0.233612,1541.019467,237.586385
718,Albert,2.63878,0.546301,11.564845,183.887287,156.163668,1.197212,4.080348,4.286601,39478.0,...,,,S,,,0.203359,AMO,0.229932,1565.680891,48.317252
729,Athanasia,2.243362,0.177505,4.234895,95.073806,123.549777,1.845154,2.64157,3.360139,39112.0,...,,,,,,0.841461,MBA,0.293329,1227.290668,295.120065
842,Nicolaia,2.279598,0.209766,7.997715,4.071354,316.957209,1.801415,2.75778,3.441878,37651.0,...,,,,,,0.79823,MBA,0.286363,1257.146084,317.982712


In [46]:
# How many rows lack a diameter versus how many have one.
print("Entries with null diameter are :", df['diameter'].isna().sum())
print("Entries with not null diameter are :", df['diameter'].count())

Entries with null diameter are : 702078
Entries with not null diameter are : 137636


- __Cleaning__:
<br/> 
__Problem__: 'diameter' is stored as a string, so I convert it to numeric. The conversion raised errors for some diameters because they were corrupted, so I added the argument `errors='coerce'` to set corrupted diameters to NaN, and later dropped those rows.
<br/> 
Dropping irrelevant features and choosing my battles:
<br/> 
Dropping names because I don't believe asteroid names are a useful feature.
<br/> 
Dropping all features with more than half NaN values.
<br/> 
Dropping condition_code, neo and pha because most values seem to be 0 or NaN.
<br/> 
Replacing the remaining NaN entries with the mean value of the column.

In [47]:
# Parse 'diameter' as a number; values that cannot be parsed become NaN.
df['diameter'] = pd.to_numeric(df['diameter'], errors='coerce')
# Index labels of the rows that have no usable diameter.
dropindexes = df.index[df['diameter'].isna().to_numpy()]
# Keep a copy of those rows for later use before removing them.
dropped_df = df.loc[dropindexes]
# Continue with only the rows that have a numeric diameter.
df = df.dropna(subset=['diameter'])

In [49]:
# (rows, columns) once the rows with NaN diameter have been removed
print((len(df.index), len(df.columns)))

(137635, 31)


In [51]:
# Report the current column names and their count.
# NOTE(review): the recorded output (18 columns) was produced with execution
# count 51, i.e. after the column-dropping cell that follows (count 50) had run.
# On a fresh top-to-bottom run this cell executes before that drop.
print("variables in dataset are", [*df.columns])
print("No. of variables are", df.columns.size)

variables in dataset are ['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'n_obs_used', 'H', 'diameter', 'albedo', 'moid', 'class', 'n', 'per', 'ma']
No. of variables are 18


In [50]:
# Columns in which more than half of the rows are NaN.
tooMuchNa = df.columns[df.isna().mean() > 0.5]
# Remove the sparse columns, then condition_code, neo and pha.
df = (
    df.drop(columns=tooMuchNa)
      .drop(columns=['condition_code', 'neo', 'pha'])
)

In [52]:
# Replace remaining NaNs with the mean of their column.
# numeric_only=True restricts the means to numeric columns: the frame still
# holds the string column 'class', which makes a plain df.mean() emit a
# FutureWarning on older pandas and raise TypeError on pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))

  """Entry point for launching an IPython kernel.


In [53]:
# Preview five random rows after cleaning; the fixed seed keeps the displayed
# rows the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,H,diameter,albedo,moid,class,n,per,ma
295747,2.740023,0.06487,7.108441,24.768268,96.40812,2.562278,2.917767,4.535649,9001.0,258,16.2,4.069,0.039,1.59036,MBA,0.217307,1656.645784,41.616861
235902,3.157194,0.21297,23.54705,79.402643,120.789748,2.484805,3.829583,5.609965,5873.0,278,14.9,5.597,0.081,1.55684,MBA,0.175692,2049.039793,162.478011
60018,3.193277,0.217899,9.757447,193.243072,231.162908,2.497465,3.889089,5.706412,10219.0,921,14.0,8.824,0.052,1.5236,MBA,0.172723,2084.267074,122.87285
104363,2.776176,0.084396,5.391932,147.422655,223.021241,2.541878,3.010473,4.625712,7416.0,766,14.8,7.077,0.032,1.54307,MBA,0.213076,1689.541351,224.935142
125623,2.336655,0.126636,3.746659,104.313854,341.452188,2.040751,2.63256,3.571908,9691.0,469,16.6,2.231,0.098,1.0574,MBA,0.275938,1304.639258,306.444325


In [54]:
# Remove the 'albedo' and 'H' columns from the working frame.
df = df.drop(columns=['albedo', 'H'])

In [55]:
# As discussed in the meeting, some features appear to relate to diameter as e^x,
# so take logs to look for a trend in (log) diameter.
# NOTE: 'diameter' is overwritten with its log here, so re-running this cell
# without reloading the data applies the log a second time.
df['diameter'] = np.log(df['diameter'])
for column in df.columns.drop(['diameter', 'class'], errors='ignore'):
    # log(0) is -inf, which dropna() does not treat as missing. Map +/-inf to
    # NaN so a log column that is undefined for some rows is dropped below
    # instead of surviving and producing NaN correlations later.
    df['log(' + column + ')'] = np.log(df[column]).replace([np.inf, -np.inf], np.nan)
# Drop every column that contains a NaN (e.g. logs of non-positive values).
df = df.dropna(axis=1)


In [56]:
# Preview five random rows including the new log(...) columns; the fixed seed
# keeps the displayed rows the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,...,log(w),log(q),log(ad),log(per_y),log(data_arc),log(n_obs_used),log(moid),log(n),log(per),log(ma)
15336,2.427696,0.293285,22.674203,91.241936,294.923796,1.71569,3.139702,3.78268,24376.0,1310,...,5.686717,0.539815,1.144128,1.330433,10.101354,7.177782,-0.184389,-1.344911,7.231015,5.554684
117704,2.783553,0.181948,12.162393,65.557124,199.60974,2.27709,3.290016,4.644162,13509.0,538,...,5.296364,0.822898,1.190892,1.535611,9.511111,6.287859,0.235127,-1.550089,7.436193,5.759228
86351,2.234735,0.147807,0.932993,7.52738,102.258363,1.904426,2.565044,3.340775,7795.0,499,...,4.627503,0.64418,0.941976,1.206203,8.961238,6.212606,-0.081622,-1.220681,7.106785,5.559886
56352,2.715575,0.209559,8.594212,109.942023,47.158993,2.1465,3.284649,4.47508,23071.0,801,...,3.853525,0.763839,1.18926,1.498524,10.046332,6.685861,0.155087,-1.513002,7.399106,4.01944
4949,2.746756,0.183083,12.700588,189.86779,251.336923,2.243871,3.249642,4.552379,12233.0,2203,...,5.526794,0.808202,1.178545,1.51565,9.411892,7.697575,0.258982,-1.530128,7.416232,5.500261


In [59]:
# Report the column names (original plus log features) and their count.
print("variables in dataset are", list(df))
print("No. of variables are", len(df.columns))

variables in dataset are ['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'n_obs_used', 'diameter', 'moid', 'class', 'n', 'per', 'ma', 'log(a)', 'log(e)', 'log(i)', 'log(om)', 'log(w)', 'log(q)', 'log(ad)', 'log(per_y)', 'log(data_arc)', 'log(n_obs_used)', 'log(moid)', 'log(n)', 'log(per)', 'log(ma)']
No. of variables are 30


### Correlation analysis

In [58]:
# Absolute Pearson correlation of every numeric feature with (log) diameter,
# strongest first. Non-numeric columns such as 'class' are excluded up front:
# pandas >= 2.0 no longer drops them silently and df.corr() would raise.
df.select_dtypes(include='number').corr()['diameter'].abs().sort_values(ascending=False)

diameter           1.000000
log(a)             0.563616
log(per_y)         0.563616
log(n)             0.563616
log(per)           0.563616
log(q)             0.543737
log(moid)          0.528689
n                  0.525392
q                  0.522404
moid               0.521095
data_arc           0.519390
n_obs_used         0.511250
log(ad)            0.477252
log(n_obs_used)    0.433656
log(data_arc)      0.298793
a                  0.195634
e                  0.185047
log(e)             0.157921
ad                 0.112606
i                  0.096037
log(i)             0.088749
per                0.046649
per_y              0.046649
ma                 0.030946
log(ma)            0.023154
log(w)             0.006008
w                  0.005310
om                 0.001478
log(om)            0.000169
Name: diameter, dtype: float64

### __TODO__: 
Explore more using KDE
<br/>
Simple analysis graphs
<br/>
why we are using the log - graph explanation
<br/>
create train, test, validation data
<br/>
