# Data Prep for Models
This file handles the data prep for modeling including binning, choosing features, and creating dummy variables.

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/SullyRC/Drug-Patents/PriceDelta/CleanedData.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Company                 1215 non-null   object 
 1   Price                   1215 non-null   float64
 2   PriceStartDate          1215 non-null   object 
 3   Date Added              1215 non-null   object 
 4   InflationAdjustedPrice  1215 non-null   float64
 5   Analysis                1188 non-null   object 
 6   P or E                  1215 non-null   object 
 7   Pre2005Flag             1215 non-null   int64  
 8   PreviousPatents         1215 non-null   int64  
 9   LatestExpiration        1215 non-null   object 
 10  MonthsUntilExpiration   1215 non-null   float64
 11  PriceDelta              1215 non-null   float64
 12  PercentageE             1215 non-null   float64
 13  Net_Income              1215 non-null   float64
 14  US Company              1215 non-null   

We'll subset our dataset to not include Pre2005 Patents

In [4]:
df = df[df['Pre2005Flag']!=1]
df = df.drop(columns=['Pre2005Flag'])

We'll also subset where price is greater than 0.

In [5]:
df = df[df['InflationAdjustedPrice'] > 0]
df = df.drop(columns=['InflationAdjustedPrice'])

We'll change P or E to be "1" representing an extension and "0" representing a patent.

In [6]:
df.loc[df['P or E'] == 'E','EvergreenFlag'] = 1
df.loc[df['P or E'] == 'P','EvergreenFlag'] = 0
df['P or E'] = pd.to_numeric(df['EvergreenFlag'])

Next we'll subset the dataframe to the columns we want.

In [7]:
df = df[['PercentageE','PriceDelta','MonthsUntilExpiration','PreviousPatents',
         'Net_Income','US Company','IncomeUnknown','EvergreenFlag']]

Now we'll bin our continuous data.

In [8]:
df.describe()

Unnamed: 0,PercentageE,PriceDelta,MonthsUntilExpiration,PreviousPatents,Net_Income,US Company,IncomeUnknown,EvergreenFlag
count,1215.0,1215.0,1215.0,1215.0,1215.0,1215.0,1215.0,1215.0
mean,0.122659,-0.122796,134.435391,19.265021,1779033.0,0.899588,0.138272,0.178601
std,0.123053,0.211262,50.539663,24.103714,8852137.0,0.300672,0.345327,0.383175
min,0.0,-0.845604,9.0,0.0,-1991692.0,0.0,0.0,0.0
25%,0.0,-0.09106,95.0,1.0,2809.388,1.0,0.0,0.0
50%,0.12973,-0.043701,141.0,10.0,12423.5,1.0,0.0,0.0
75%,0.183036,-0.026729,179.0,28.0,55866.66,1.0,0.0,0.0
max,1.0,1.502865,228.0,119.0,61595030.0,1.0,1.0,1.0


In [9]:
def binContinuous(column,start,stepsize,df=df):
    binStart = start
    while binStart <= df[column].max():
        binEnd = binStart+stepsize
        binName = column+ str(binStart) +":"+ str(binEnd)
        df.loc[(df[column]>=binStart)&(df[column]<binEnd),binName] = 1
        df.loc[df[binName]!=1,binName]=0
        if binEnd == df[column].max():
            df.loc[df[column]==binEnd,binName]=1
        binStart += stepsize

In [10]:
def ZStandardize(column,df=df):
    df[column] = (df[column] - df[column].mean())/(df[column].std())

In [11]:
binContinuous('PercentageE',0,.2)
ZStandardize('PercentageE')

To ensure that the function works properly we'll create a check column. This column will just add all of the bins together in order to ensure that there is at least one category for each record.

In [12]:
df['Check'] = df['PercentageE0:0.2']+df['PercentageE0.2:0.4']+ df['PercentageE0.4:0.6000000000000001']+df['PercentageE0.6000000000000001:0.8']+df['PercentageE0.8:1.0']
df['Check'].value_counts()

1.0    1215
Name: Check, dtype: int64

In [13]:
df['Net_Income'] = (df['Net_Income'] - df['Net_Income'].mean())/(df['Net_Income'].std())
#binContinuous('Net_Income',-1*(10**6),(10**7))

This all appears to work fine, so we'll continue for the rest of the continuous data.

In [14]:
binContinuous('PriceDelta',-1,.2)
binContinuous('MonthsUntilExpiration',0,12)
binContinuous('PreviousPatents',0,20)

In [15]:
ZStandardize('PriceDelta')
ZStandardize('MonthsUntilExpiration')
ZStandardize('PreviousPatents')

In [16]:
df.shape

(1215, 54)

In [17]:
binned = df.drop(columns=['PercentageE','PriceDelta','MonthsUntilExpiration','PreviousPatents','Net_Income'])
binned.to_csv('mbdata.csv',index=False)

In [18]:
standardized = df[['PercentageE','PriceDelta','MonthsUntilExpiration','PreviousPatents','Net_Income',
                   'IncomeUnknown','US Company','EvergreenFlag']]
standardized.to_csv('msdata.csv',index=False)