In [1]:
import pandas as pd
df = pd.read_csv("student-data.csv")

In [3]:
len(df.index)

395

In [4]:
df.shape

(395, 31)

In [7]:
df.ix()

<pandas.core.indexing._IXIndexer at 0x7efea7c58510>

In [9]:
sum(df.passed == 'no')

130

In [10]:
sum(df.passed == 'yes')

265

In [13]:
df.drop('school')

ValueError: labels ['school'] not contained in axis

In [16]:
df.columns[1:-1]

Index([u'sex', u'age', u'address', u'famsize', u'Pstatus', u'Medu', u'Fedu',
       u'Mjob', u'Fjob', u'reason', u'guardian', u'traveltime', u'studytime',
       u'failures', u'schoolsup', u'famsup', u'paid', u'activities',
       u'nursery', u'higher', u'internet', u'romantic', u'famrel', u'freetime',
       u'goout', u'Dalc', u'Walc', u'health', u'absences'],
      dtype='object')

In [18]:
df[3, 1]

KeyError: (3, 1)

In [19]:
# 提取特征列
feature_cols = list(df.columns[:-1])

# 提取目标列 ‘passed’
target_col = df.columns[-1] 

# 显示列的列表
print "Feature columns:\n{}".format(feature_cols)
print "\nTarget column: {}".format(target_col)

# 将数据分割成特征数据和目标数据（即X_all 和 y_all）
X_all = df[feature_cols]
y_all = df[target_col]

# 通过打印前5行显示特征信息
print "\nFeature values:"
print X_all.head()

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...       

In [20]:
def preprocess_features(X):
    ''' 预处理学生数据，将非数字的二元特征转化成二元值（0或1），将分类的变量转换成虚拟变量
    '''
    
    # 初始化一个用于输出的DataFrame
    output = pd.DataFrame(index = X.index)

    # 查看数据的每一个特征列
    for col, col_data in X.iteritems():
        
        # 如果数据是非数字类型，将所有的yes/no替换成1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # 如果数据类型是类别的（categorical），将它转换成虚拟变量
        if col_data.dtype == object:
            # 例子: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # 收集转换后的列
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [31]:
from sklearn.model_selection import ShuffleSplit
# TODO：设置训练集的数量
num_train = 300

# TODO：设置测试集的数量
num_test = X_all.shape[0] - num_train

# TODO：把数据集混洗和分割成上面定义的训练集和测试集
rs = ShuffleSplit(n_splits=1, test_size=num_test, train_size=num_train, random_state=0)
for train_index, test_index in rs.split(range(395)):
    train=list(train_index)
    test=list(test_index)

In [24]:
list((1,2))

[1, 2]

In [42]:
type(train)

list

In [44]:
X_all.ix[test]

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,age,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
329,1.0,0.0,1.0,0.0,17,0.0,1.0,1.0,0.0,0.0,...,1,1,1,4,3,3,1,2,4,4
318,1.0,0.0,1.0,0.0,17,1.0,0.0,1.0,0.0,0.0,...,1,1,0,4,3,4,2,5,5,0
317,1.0,0.0,1.0,0.0,18,0.0,1.0,1.0,0.0,0.0,...,1,1,1,4,3,4,1,1,5,9
65,1.0,0.0,1.0,0.0,16,0.0,1.0,0.0,1.0,0.0,...,1,1,0,5,4,3,1,2,1,2
59,1.0,0.0,1.0,0.0,16,0.0,1.0,1.0,0.0,0.0,...,1,1,0,4,2,3,1,1,5,2
287,1.0,0.0,1.0,0.0,17,0.0,1.0,1.0,0.0,0.0,...,1,1,0,4,3,3,1,1,3,6
212,1.0,0.0,1.0,0.0,16,0.0,1.0,1.0,0.0,1.0,...,1,1,0,3,3,4,1,1,4,0
60,1.0,0.0,1.0,0.0,16,1.0,0.0,1.0,0.0,0.0,...,1,0,0,2,4,4,2,3,4,6
394,0.0,1.0,0.0,1.0,19,0.0,1.0,0.0,1.0,0.0,...,1,1,0,3,2,3,3,3,5,5
324,1.0,0.0,1.0,0.0,17,0.0,1.0,0.0,1.0,0.0,...,1,1,0,3,3,3,2,3,2,0
