In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sma

In [2]:
df = pd.read_csv("CleanedData.csv")
df = df[df["Price"] > 0]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6013 entries, 0 to 6160
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Price                   6013 non-null   float64
 1   PriceStartDate          6013 non-null   object 
 2   Date Added              6013 non-null   object 
 3   InflationAdjustedPrice  6013 non-null   float64
 4   Analysis                5992 non-null   object 
 5   P or E                  6013 non-null   object 
 6   Pre2005Flag             6013 non-null   int64  
 7   PreviousPatents         6013 non-null   int64  
 8   LatestExpiration        6013 non-null   object 
 9   MonthsUntilExpiration   6013 non-null   float64
dtypes: float64(3), int64(2), object(5)
memory usage: 516.7+ KB


In [4]:
mean = df['InflationAdjustedPrice'].mean()
std =df['InflationAdjustedPrice'].std()
df['InflationAdjustedPriceZScore'] = (df['InflationAdjustedPrice'] - mean)/std
df= df[(df['InflationAdjustedPriceZScore'] <=3)&(df['InflationAdjustedPriceZScore'] >=-3)]

In [5]:
df['P or E'].value_counts(normalize=True)

P    0.70129
E    0.29871
Name: P or E, dtype: float64

Assigning binary dependent variable from the Analysis variable. The following categories will be classified as suspected evergreen:
- P:PED
- PTAorPTE
- DlistRequest
- NPP
- PED
- P-PEDExtension
- UCsamemonth
- DP
- DS/DP
- DS/DP/UCnew
- DP/UCnew
- DS

In [6]:
df['EvergreenFlag'] = [0] * len(df)
df.loc[df['P or E']=='E','EvergreenFlag'] = 1

In [7]:
df['EvergreenFlag'].value_counts(normalize=True)

0    0.70129
1    0.29871
Name: EvergreenFlag, dtype: float64

In [8]:
model_df = df[['InflationAdjustedPrice','MonthsUntilExpiration',
              'PreviousPatents','EvergreenFlag']]

In [9]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5815 entries, 0 to 6160
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   InflationAdjustedPrice  5815 non-null   float64
 1   MonthsUntilExpiration   5815 non-null   float64
 2   PreviousPatents         5815 non-null   int64  
 3   EvergreenFlag           5815 non-null   int64  
dtypes: float64(2), int64(2)
memory usage: 227.1 KB


In [10]:
model_df.describe()

Unnamed: 0,InflationAdjustedPrice,MonthsUntilExpiration,PreviousPatents,EvergreenFlag
count,5815.0,5815.0,5815.0,5815.0
mean,296.110427,101.049011,0.783147,0.29871
std,574.105321,57.753713,8.701916,0.457732
min,0.003968,-6.0,0.0,0.0
25%,14.307057,56.0,0.0,0.0
50%,55.773578,92.0,0.0,0.0
75%,269.528371,153.0,0.0,1.0
max,4179.957559,228.0,119.0,1.0


In [11]:
model_df = sma.add_constant(model_df)
clf = sma.Logit(model_df['EvergreenFlag'],model_df.drop(columns='EvergreenFlag')).fit()
print(clf.summary())

         Current function value: 0.319695
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:          EvergreenFlag   No. Observations:                 5815
Model:                          Logit   Df Residuals:                     5811
Method:                           MLE   Df Model:                            3
Date:                Wed, 22 Feb 2023   Pseudo R-squ.:                  0.4757
Time:                        23:01:26   Log-Likelihood:                -1859.0
converged:                      False   LL-Null:                       -3545.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      2.1767      0.088     24.770      0.000       2.004       2.349
InflationAdjustedPrice     0.

