In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import json
%matplotlib inline

In [19]:
def read_data(filename):
    """Load a line-delimited JSON file into a DataFrame.

    The first line of the file is a header: it is echoed to stdout and never
    parsed as a record (for 'training.json' it holds the record count).
    Every following non-blank line must contain one JSON object, and each
    object becomes one row of the result.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object; the columns are the union of the keys seen.
        The frame is empty when the file holds no records.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank record line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale.
    with open(filename, encoding='utf-8') as f:
        # readline() returns '' on an empty file, so there is no IndexError;
        # strip() removes the newline that used to leak into the message.
        header = f.readline().strip()
        print(f'reading objects: {header}')
        # Iterate the handle lazily instead of materialising every line, and
        # skip whitespace-only lines (e.g. a trailing blank line), which
        # json.loads would reject.
        json_objs = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(json_objs)

In [20]:
# Load the training records; read_data echoes the file's first line as it reads.
training_data = read_data('training.json')

reading objects: 79465



In [21]:
# Peek at the first rows to check which columns were parsed.
training_data.head()

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,serial,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
0,8.0,7.0,3.0,4,6,195490,,,,,
1,1.0,1.0,1.0,3,3,190869,,,,,
2,1.0,2.0,2.0,1,2,3111,,,,,
3,8.0,7.0,6.0,7,7,47738,,,,,
4,1.0,1.0,1.0,3,2,85520,,,,,


In [22]:
# Non-null count per column; most subject columns are only partly populated
# (presumably a NaN marks a subject the student did not take — confirm).
training_data.count()

Physics              65750
Chemistry            65817
PhysicalEducation    32403
English              79465
Mathematics          79465
serial               79465
Biology              14853
Accountancy          13561
BusinessStudies      13459
Economics            14032
ComputerScience      18520
dtype: int64

In [23]:
# Summary statistics; every grade column spans 1 to 8 (see the min/max rows).
training_data.describe()

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,serial,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
count,65750.0,65817.0,32403.0,79465.0,79465.0,79465.0,14853.0,13561.0,13459.0,14032.0,18520.0
mean,3.992593,3.983971,3.186032,3.072044,4.247858,111242.377638,3.599004,2.613672,2.893751,2.469142,3.87635
std,2.213173,2.16575,1.912542,1.796456,2.357411,64406.827932,2.071209,1.748936,1.95259,1.727387,2.150361
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0,2.0,55404.0,2.0,1.0,1.0,1.0,2.0
50%,4.0,4.0,3.0,3.0,4.0,111250.0,3.0,2.0,2.0,2.0,4.0
75%,6.0,6.0,4.0,4.0,6.0,167014.0,5.0,4.0,4.0,3.0,6.0
max,8.0,8.0,8.0,8.0,8.0,222844.0,8.0,8.0,8.0,8.0,8.0


In [24]:
# Shuffle the rows. The fixed random_state makes the resulting order, and any
# output that depends on it, identical on every Restart & Run All.
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [25]:
# Show the shuffled frame (pandas truncates the display to the first and last rows).
training_data

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Mathematics,serial,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
0,,,,1,1,170902,,1.0,1.0,1.0,
1,7.0,6.0,,5,6,190710,8.0,,,,
2,5.0,4.0,,7,3,137957,2.0,,,,
3,1.0,1.0,,1,1,196101,1.0,,,,
4,2.0,3.0,5.0,2,2,158684,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
79460,5.0,4.0,2.0,2,2,183420,,,,,
79461,5.0,3.0,3.0,5,5,103986,,,,,
79462,1.0,1.0,,1,2,154204,2.0,,,,
79463,3.0,2.0,4.0,4,1,118382,,,,,


In [30]:
# 'Mathematics' is the value to predict, so it is pulled out as `target` and
# removed from the feature frame together with the 'serial' identifier.
# Missing grades are replaced with 0 before the split.
training_data = training_data.fillna(0)
target = training_data['Mathematics']
training_data = training_data.drop(columns=['serial', 'Mathematics'])

In [34]:
# Wrap the target in a one-column DataFrame; later cells select the column
# back out as target['Mathematics'].
target = pd.DataFrame(target)
target.head()

Unnamed: 0,Mathematics
0,1
1,6
2,3
3,1
4,2


In [70]:
# Pairwise correlation between the remaining subject columns.
# NOTE(review): missing grades were filled with 0 earlier, which probably
# drives the strong negative science-vs-commerce values — confirm.
training_data.corr()

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
Physics,1.0,0.840691,0.400833,0.390902,0.186064,-0.4796,-0.476254,-0.451116,0.197819
Chemistry,0.840691,1.0,0.387115,0.35404,0.190633,-0.486363,-0.482962,-0.456656,0.222957
PhysicalEducation,0.400833,0.387115,1.0,0.365254,-0.252049,-0.187988,-0.19595,-0.220906,-0.286875
English,0.390902,0.35404,0.365254,1.0,-0.019773,0.085361,0.104091,0.06937,0.03361
Biology,0.186064,0.190633,-0.252049,-0.019773,1.0,-0.147743,-0.14668,-0.148227,-0.187716
Accountancy,-0.4796,-0.486363,-0.187988,0.085361,-0.147743,1.0,0.86165,0.823781,-0.166332
BusinessStudies,-0.476254,-0.482962,-0.19595,0.104091,-0.14668,0.86165,1.0,0.800056,-0.165306
Economics,-0.451116,-0.456656,-0.220906,0.06937,-0.148227,0.823781,0.800056,1.0,-0.169788
ComputerScience,0.197819,0.222957,-0.286875,0.03361,-0.187716,-0.166332,-0.165306,-0.169788,1.0


In [54]:
# Correlation of every feature column with the Mathematics grade, collected
# as (subject, coefficient) pairs in column order.
math_grades = target['Mathematics']
correlations = [
    (subject, math_grades.corr(grades))
    for subject, grades in training_data.items()
]

In [55]:
# Display the (subject, correlation-with-Mathematics) pairs.
correlations

[('Physics', 0.5008527783268543),
 ('Chemistry', 0.5014673449062091),
 ('PhysicalEducation', 0.22923645339661822),
 ('English', 0.4273422657146754),
 ('Biology', 0.12889957917003309),
 ('Accountancy', 0.1898775856431167),
 ('BusinessStudies', 0.16755941995892576),
 ('Economics', 0.16283735363839436),
 ('ComputerScience', 0.09736686731730619)]

In [57]:
from sklearn.linear_model import LogisticRegression

In [64]:
# Classifier with default settings; the Mathematics grade is therefore treated
# as a class label rather than as a numeric value.
clf = LogisticRegression()

In [72]:
# Fit on three subject columns to predict the Mathematics grade.
# NOTE(review): Chemistry correlates more strongly with Mathematics than
# Economics does in the list computed earlier — confirm this feature choice.
clf.fit(training_data[['English','Physics','Economics']], target['Mathematics'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [73]:
# Mean accuracy on the same rows the model was fitted on, i.e. training
# accuracy, not a held-out estimate.
clf.score(training_data[['English','Physics','Economics']], target['Mathematics'])

0.30050965834014975

In [None]:
# TODO: supply the path of the test file; read_data passes this string to
# open(), and an empty path raises FileNotFoundError.
test_data = read_data('')