In [1]:
%matplotlib inline
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import scipy.stats
import pandas.io.sql as psql
import psycopg2 as pg
from statsmodels import api as sm
import itertools
matplotlib.style.use('ggplot')

# Database connection
with pg.connect(database='research') as conn:
    sql = 'SELECT full_name, age, size, commits, internal_cohesion, degree_centrality, firm_involvement \
        FROM repositories ORDER BY id LIMIT 1000;'
    data = psql.read_sql(sql, conn)

sub = data[["age", "size", "commits", "internal_cohesion", "firm_involvement", "degree_centrality"]]
sub_data = sub.copy()

sub_data["age"] = pd.to_numeric(sub_data["age"], errors="coerce")
sub_data["size"] = pd.to_numeric(sub_data["size"], errors="coerce")
sub_data["commits"] = pd.to_numeric(sub_data["commits"], errors="coerce")
sub_data["internal_cohesion"] = pd.to_numeric(sub_data["internal_cohesion"], errors="coerce")
sub_data["firm_involvement"] = pd.to_numeric(sub_data["firm_involvement"], errors="coerce")
sub_data["degree_centrality"] = pd.to_numeric(sub_data["degree_centrality"], errors="coerce")

sub_data["internal_cohesion_c"] = sub_data["internal_cohesion"] - sub_data["internal_cohesion"].mean()
sub_data["degree_centrality_c"] = sub_data["degree_centrality"] - sub_data["degree_centrality"].mean()
sub_data["ic_fi"] = sub_data["internal_cohesion"] * sub_data["firm_involvement"]
sub_data["dc_fi"] = sub_data["degree_centrality"] * sub_data["firm_involvement"]
sub_data["ic_fi_c"] = sub_data["ic_fi"] - sub_data["ic_fi"].mean()
sub_data["dc_fi_c"] = sub_data["dc_fi"] - sub_data["dc_fi"].mean()

# TODO check
sub_data["ic_n"] = sub_data["internal_cohesion"].div(sub_data["internal_cohesion"].sum(1), axis=0)
sub_data["dc_n"] = sub_data["degree_centrality"].div(sub_data["degree_centrality"].sum(1), axis=0)




ImportError: No module named 'psycopg2'

In [None]:
# commits数のヒストグラム
commits_hist = plt.figure()
sub_data['commits'].plot.hist(bins=10)
commits_hist.show()

In [2]:
# 散布図
fig = plt.figure()
labels = ['commits', 'firm_involvement', 'degree_centrality', 'internal_cohesion']
for l1, l2 in itertools.combination(labels, 2):
    sub_data.plot(kind='scatter', x=l2, y=l1)
fig.show()

NameError: name 'itertools' is not defined

<matplotlib.figure.Figure at 0x1147925c0>

In [28]:
sub_data.corr(method='pearson')

Unnamed: 0,age,size,commits,internal_cohesion,firm_involvement,degree_centrality,internal_cohesion_c,degree_centrality_c,ic_fi,dc_fi,ic_fi_c,dc_fi_c
age,1.0,0.060155,0.141055,0.052815,0.076795,0.114404,0.052815,0.114404,0.0775,0.093701,0.0775,0.093701
size,0.060155,1.0,0.2627,0.08752,0.122219,0.180282,0.08752,0.180282,0.141628,0.181857,0.141628,0.181857
commits,0.141055,0.2627,1.0,0.434514,0.259657,0.681768,0.434514,0.681768,0.494005,0.649221,0.494005,0.649221
internal_cohesion,0.052815,0.08752,0.434514,1.0,0.162507,0.602621,1.0,0.602621,0.697788,0.503314,0.697788,0.503314
firm_involvement,0.076795,0.122219,0.259657,0.162507,1.0,0.196595,0.162507,0.196595,0.459425,0.328013,0.459425,0.328013
degree_centrality,0.114404,0.180282,0.681768,0.602621,0.196595,1.0,0.602621,1.0,0.61843,0.891674,0.61843,0.891674
internal_cohesion_c,0.052815,0.08752,0.434514,1.0,0.162507,0.602621,1.0,0.602621,0.697788,0.503314,0.697788,0.503314
degree_centrality_c,0.114404,0.180282,0.681768,0.602621,0.196595,1.0,0.602621,1.0,0.61843,0.891674,0.61843,0.891674
ic_fi,0.0775,0.141628,0.494005,0.697788,0.459425,0.61843,0.697788,0.61843,1.0,0.720445,1.0,0.720445
dc_fi,0.093701,0.181857,0.649221,0.503314,0.328013,0.891674,0.503314,0.891674,0.720445,1.0,0.720445,1.0


In [25]:
model = sm.formula.ols(formula='commits ~ internal_cohesion_c + degree_centrality_c + ic_fi_c + dc_fi_c + firm_involvement', data=sub_data)

result = model.fit()
result.summary()

In [31]:
model2 = sm.formula.ols(formula='commits ~ internal_cohesion_c + ic_fi_c + firm_involvement', data=sub_data)
result2 = model2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,commits,R-squared:,0.265
Model:,OLS,Adj. R-squared:,0.262
Method:,Least Squares,F-statistic:,119.2
Date:,"Wed, 04 Jan 2017",Prob (F-statistic):,5.99e-66
Time:,12:40:46,Log-Likelihood:,-8477.9
No. Observations:,998,AIC:,16960.0
Df Residuals:,994,BIC:,16980.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,543.8050,52.244,10.409,0.000,441.283 646.327
firm_involvement[T.True],226.7570,88.691,2.557,0.011,52.713 400.801
internal_cohesion_c,144.7142,28.376,5.100,0.000,89.031 200.398
ic_fi_c,290.8443,39.910,7.288,0.000,212.527 369.162

0,1,2,3
Omnibus:,880.25,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39925.835
Skew:,3.829,Prob(JB):,0.0
Kurtosis:,33.025,Cond. No.,5.99


In [33]:
model3 = sm.formula.ols(formula='commits ~ degree_centrality_c + dc_fi_c + firm_involvement', data=sub_data)
result3 = model3.fit()
result3.summary()

0,1,2,3
Dep. Variable:,commits,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.482
Method:,Least Squares,F-statistic:,310.3
Date:,"Wed, 04 Jan 2017",Prob (F-statistic):,3.86e-142
Time:,12:45:49,Log-Likelihood:,-8301.5
No. Observations:,998,AIC:,16610.0
Df Residuals:,994,BIC:,16630.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,508.8041,42.415,11.996,0.000,425.570 592.038
firm_involvement[T.True],312.1625,69.455,4.494,0.000,175.868 448.457
degree_centrality_c,14.5481,1.357,10.725,0.000,11.886 17.210
dc_fi_c,3.3609,1.519,2.213,0.027,0.380 6.342

0,1,2,3
Omnibus:,933.363,Durbin-Watson:,2.12
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49734.774
Skew:,4.152,Prob(JB):,0.0
Kurtosis:,36.572,Cond. No.,169.0
