In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

### 1. Load data and create bankruptcy column

In [2]:
# Parse the ARFF file; element 0 of the returned pair holds the records.
data = arff.loadarff('4year.arff')
records = data[0]
df = pd.DataFrame(records)
# The 'class' labels load as byte strings (b'0' / b'1'); flag the b'1' rows.
is_bankrupt = df['class'].eq(b'1')
df['bankruptcy'] = is_bankrupt

In [3]:
# Preview the first five rows of the full table, including the new flag column.
df.head(5)

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,b'0',False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,b'0',False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,b'0',False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,b'0',False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,b'0',False


### 2. Create a new dataframe with only 4 features (and Bankruptcy) and rename the columns to X1, X2, X7, and X10

In [4]:
# Keep four attributes plus the bankruptcy flag, then shorten the attribute
# names to X1, X2, X7 and X10 in a single chained expression.
df1 = (
    df[['Attr1', 'Attr2', 'Attr7', 'Attr10', 'bankruptcy']]
    .rename(columns={
        'Attr1': 'X1',
        'Attr2': 'X2',
        'Attr7': 'X7',
        'Attr10': 'X10',
    })
)

In [5]:
# Preview the first five rows of the reduced, renamed table.
df1.head(5)

Unnamed: 0,X1,X2,X7,X10,bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False


### 3. Fill in the missing values (NaN) with the column means

In [6]:
# Impute each feature's missing entries with that feature's own mean.
# The means are computed before filling, so they ignore the missing entries.
feature_columns = ['X1', 'X2', 'X7', 'X10']
mean = df1[feature_columns].mean()
df1[feature_columns] = df1[feature_columns].fillna(mean)

### 4. Find the mean and std of 3 groups

**4.1 Mean and std of all companies**

In [7]:
# Per-feature mean over every company, with no filtering on the bankruptcy flag.
df1.loc[:, ['X1', 'X2', 'X7', 'X10']].mean()

X1     0.043019
X2     0.596404
X7     0.059446
X10    0.389040
dtype: float64

In [8]:
# Per-feature standard deviation over every company.
df1.loc[:, ['X1', 'X2', 'X7', 'X10']].std()

X1     0.359303
X2     4.586887
X7     0.533317
X10    4.590064
dtype: float64

**4.2 Mean and std of bankrupt companies**

In [9]:
# Per-feature mean restricted to rows whose bankruptcy flag is True.
df1.loc[df1['bankruptcy'], ['X1', 'X2', 'X7', 'X10']].mean()

X1    -0.068873
X2     0.878355
X7    -0.061538
X10    0.103367
dtype: float64

In [10]:
# Per-feature standard deviation restricted to rows whose bankruptcy flag is True.
df1.loc[df1['bankruptcy'], ['X1', 'X2', 'X7', 'X10']].std()

X1     0.568076
X2     1.945596
X7     0.568432
X10    1.946747
dtype: float64

**4.3 Mean and std of still-operating companies**

In [11]:
# Per-feature mean restricted to rows whose bankruptcy flag is False.
df1.loc[~df1['bankruptcy'], ['X1', 'X2', 'X7', 'X10']].mean()

X1     0.049231
X2     0.580752
X7     0.066162
X10    0.404899
dtype: float64

In [12]:
# Per-feature standard deviation restricted to rows whose bankruptcy flag is False.
df1.loc[~df1['bankruptcy'], ['X1', 'X2', 'X7', 'X10']].std()

X1     0.343002
X2     4.689694
X7     0.530524
X10    4.692934
dtype: float64

### 5. Find companies that satisfy the condition X1 < mean(X1) - std(X1) AND X10 < mean(X10) - std(X10)

In [13]:
# Cutoffs sit one standard deviation below each feature's mean, computed over
# all companies in df1.
x1_cutoff = df1.X1.mean() - df1.X1.std()
x10_cutoff = df1.X10.mean() - df1.X10.std()

# A company qualifies only when it is strictly below BOTH cutoffs.
below_x1 = df1.X1 < x1_cutoff
below_x10 = df1.X10 < x10_cutoff
df2 = df1[below_x1 & below_x10]

In [14]:
# Report how many rows passed the two-threshold filter.
print('The number of companies satisfy the condition: ', df2.shape[0])

The number of companies satisfy the condition:  15


In [15]:
# Display the companies selected by the X1 / X10 threshold filter.
df2

Unnamed: 0,X1,X2,X7,X10,bankruptcy
2312,-1.0927,5.6368,-1.0927,-4.6368,False
2608,-3.7231,11.53,-3.6424,-10.53,False
3017,-1.948,25.005,-1.948,-24.005,False
3739,-0.72685,6.9334,-0.72685,-5.9334,False
4767,-5.9655,6.6818,-5.9655,-5.6818,False
5001,-3.2845,20.403,-3.2845,-19.403,False
5259,-0.44,16.487,-0.44,-15.487,False
5859,-0.32841,6.1187,-0.32841,-5.1187,False
6264,-0.72755,5.2632,-0.72755,-4.2632,False
7846,-1.9841,13.063,-1.9841,-12.473,False


### 6. The ratio of bankrupt companies in the subgroup above

In [16]:
# Share of the selected companies that are flagged bankrupt: summing the
# boolean column counts the True rows, and dividing by the row count gives
# the ratio.
int(df2.bankruptcy.sum()) / len(df2)

0.2