In [1]:
import pandas as pd
users = pd.read_csv("/content/users.csv")

In [3]:
sorted_df = users.sort_values(by='followers', ascending=False)

In [41]:
users['created_at'] = pd.to_datetime(users['created_at'])
earliest_users = users.nsmallest(5, 'created_at')

In [42]:
earliest_users

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength,bio_word_count
375,kana,Kana Natsuno,,"Tokyo, Japan",,true,"To Vim, or not to Vim.",142,982,563,2008-02-29 17:09:01,1.741135,6
466,kakutani,Kakutani Shintaro,"FREELANCE, RUBY-NO-KAI","Tokyo, Japan",shintaro@kakutani.com,true,a plain-old agile rubyist-ist. keeb zombie.\r\n,57,523,47,2008-03-04 17:11:22,10.895833,6
172,mootoh,Motohiro Takayama,"NEWMO, INC",Tokyo,mootoh@gmail.com,true,,70,217,50,2008-03-07 09:57:53,4.254902,0
179,lhl,Leonard,AUGMXNT,Tokyo,,none,Independent technologist,99,215,73,2008-03-08 09:29:25,2.905405,2
145,walf443,"Keiji, Yoshimi",PIXIV,"Tokyo, Japan",walf443 at gmail dot com,none,,168,227,334,2008-03-09 22:42:01,0.677612,0


In [12]:
repo = pd.read_csv("/content/repositories.csv")

In [13]:
valid_licenses = repo[repo['license_name'].notnull()]
license_counts = valid_licenses['license_name'].value_counts()
top_licenses = license_counts.head(3)
top_licenses_list = list(zip(top_licenses.index, top_licenses.values))


for license_name, count in top_licenses_list:
    print(f"{license_name}: {count}")

MIT License: 845
Other: 214
Apache License 2.0: 189


In [14]:
company_counts = users['company'].value_counts()
most_common_company = company_counts.idxmax()
most_common_count = company_counts.max()

print(f"The majority of developers work at: {most_common_company} (Count: {most_common_count})")

The majority of developers work at: GOOGLE (Count: 12)


In [15]:
language_counts = repo['language'].value_counts()
most_popular_language = language_counts.idxmax()
most_popular_count = language_counts.max()

print(f"The most popular programming language among these users is: {most_popular_language} (Count: {most_popular_count})")

The most popular programming language among these users is: JavaScript (Count: 343)


In [16]:
users['created_at'] = pd.to_datetime(users['created_at'])
recent_users = users[users['created_at'] > '2020-01-01']
recent_logins = recent_users['login'].unique()
recent_repos = repo[repo['login'].isin(recent_logins)]
language_counts = recent_repos['language'].value_counts()
if len(language_counts) >= 2:
    second_most_popular_language = language_counts.index[1]
    second_most_popular_count = language_counts.values[1]
else:
    second_most_popular_language = None
    second_most_popular_count = 0


if second_most_popular_language:
    print(f"The second most popular programming language among users who joined after 2020 is: {second_most_popular_language} (Count: {second_most_popular_count})")
else:
    print("Not enough data to determine the second most popular programming language.")

The second most popular programming language among users who joined after 2020 is: JavaScript (Count: 7)


In [17]:
average_stars_per_language = repo.groupby('language')['stargazers_count'].mean()
highest_average_language = average_stars_per_language.idxmax()
highest_average_stars = average_stars_per_language.max()
print(f"The programming language with the highest average number of stars per repository is: {highest_average_language} (Average Stars: {highest_average_stars:.2f})")

The programming language with the highest average number of stars per repository is: PHP (Average Stars: 15479.00)


In [18]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top_leaders = users.nlargest(5, 'leader_strength')
top_leaders_info = top_leaders[['login', 'name', 'leader_strength']]
print("Top 5 users by leader_strength:")
print(top_leaders_info)

Top 5 users by leader_strength:
              login              name  leader_strength
484         blueimp  Sebastian Tschan           4044.0
476         dai-shi       Daishi Kato           3467.5
489       asahilina        Asahi Lina           3006.0
494  pilcrowonpaper           pilcrow           2607.0
481          marcan     Hector Martin           2281.5


In [19]:
correlation = users['followers'].corr(users['public_repos'])
print(f"Correlation between followers and public repositories in Tokyo: {correlation:.3f}")

Correlation between followers and public repositories in Tokyo: 0.053


In [22]:
!pip install pandas statsmodels
import pandas as pd
import statsmodels.api as sm

X = users['public_repos']
Y = users['followers']

X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
followers_per_repo = model.params['public_repos']
print(f"Estimated additional followers per additional public repository: {followers_per_repo:.3f}")

Estimated additional followers per additional public repository: 0.289


In [24]:
repo['has_projects'] = repo['has_projects'].astype(int)
repo['has_wiki'] = repo['has_wiki'].astype(int)
correlation = repo['has_projects'].corr(repo['has_wiki'])


print(f"Correlation between projects enabled and wiki enabled: {correlation:.3f}")

Correlation between projects enabled and wiki enabled: 0.548


In [34]:
average_following_hireable = users[users['hireable'] == 'true']['following'].mean()
average_following_non_hireable = users[users['hireable'] == 'none']['following'].mean()
average_difference = average_following_hireable - average_following_non_hireable


print(f"Average following for hireable users minus average following for non-hireable users: {average_difference}")

Average following for hireable users minus average following for non-hireable users: -80.07290493440078


In [30]:
average_following_hireable

180.43956043956044

In [31]:
average_following_non_hireable

nan

In [33]:
users['hireable'].unique()

array(['none', 'true'], dtype=object)

In [38]:
import pandas as pd
import numpy as np
from scipy import stats

def word_count(string):
    return len(string.split())
users['bio_word_count'] = users['bio'].fillna('').apply(word_count)

df_with_bio = users[users['bio_word_count'] > 0]

correlation = df_with_bio['bio_word_count'].corr(df_with_bio['followers'])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_with_bio['bio_word_count'], df_with_bio['followers'])

print(f"Correlation between bio word count and followers: {correlation:.3f}")
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Correlation between bio word count and followers: 0.109
Regression slope of followers on bio word count: 0.001


In [44]:
total_hireable = users[users['hireable'] == 'true'].shape[0]
hireable_with_email = users[users['hireable'] == 'true']['email'].notna().sum()
fraction_hireable_with_email = hireable_with_email / total_hireable if total_hireable > 0 else 0
total_non_hireable = users[users['hireable'] == 'none'].shape[0]
non_hireable_with_email = users[users['hireable'] == 'none']['email'].notna().sum()
fraction_non_hireable_with_email = non_hireable_with_email / total_non_hireable if total_non_hireable > 0 else 0
email_fraction_difference = fraction_hireable_with_email - fraction_non_hireable_with_email

print(f"Fraction difference in users with email: {email_fraction_difference:.3f}")

Fraction difference in users with email: 0.130


In [45]:
users_with_names = users[users['name'].notna()]
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]
surname_counts = users_with_names['surname'].value_counts()
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()
most_common_surnames.sort()
result = ', '.join(most_common_surnames)


print(f"The most common surname(s): {result}")

The most common surname(s): Kato, Tanaka


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]


In [54]:
import pandas as pd

df = pd.read_csv('repositories.csv')
df['created_at'] = pd.to_datetime(df['created_at'])
df['day_of_week'] = df['created_at'].dt.dayofweek
weekend_repos = df[df['day_of_week'].isin([5, 6])]
weekend_counts = weekend_repos['login'].value_counts()
top_5_users = weekend_counts.head(5).index.tolist()

result = ','.join(top_5_users)
print(result)

azu,hajimehoshi,kishikawakatsumi,kazupon,privatenumber


In [58]:
import pandas as pd
import numpy as np
from scipy import stats


df = pd.read_csv('users.csv')


def word_count(bio):
    if pd.isna(bio):
        return 0
    return len(str(bio).split())


df['bio_words'] = df['bio'].apply(word_count)


df_with_bio = df[df['bio_words'] > 0]


slope, intercept, r_value, p_value, std_err = stats.linregress(
    df_with_bio['bio_words'],
    df_with_bio['followers']
)

print(f"{slope:.7f}")

18.9406997


In [66]:
import pandas as pd
import numpy as np


df = pd.read_csv('/content/repositories.csv')
contingency_table = pd.crosstab(df['has_projects'], df['has_wiki'])
n = contingency_table.sum().sum()
t = contingency_table.values

chi2 = (n * (t[0,0]*t[1,1] - t[0,1]*t[1,0])**2) / ((t[0,0]+t[0,1]) * (t[0,0]+t[1,0]) * (t[1,0]+t[1,1]) * (t[0,1]+t[1,1]))
phi = np.sqrt(chi2 / n)

if (t[0,0]*t[1,1]) < (t[0,1]*t[1,0]):
    phi = -phi

print(f"Correlation: {phi:.3f}")

Correlation: 0.548
