<a href="https://colab.research.google.com/github/PradeepIITMBS/TDS-PROJECT-1/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import statsmodels.api as sm

In [3]:
# Load the user and repository tables from the two CSV files in the working directory.
users_df, repos_df = (
    pd.read_csv(csv_path) for csv_path in ('users.csv', 'repositories.csv')
)

In [4]:
# 1. Who are the top 5 users in Singapore with the highest number of followers?
#    Answer format: logins, comma-separated, highest follower count first.
users_by_followers = users_df.sort_values(by='followers', ascending=False)
top_five_users = users_by_followers['login'].head(5).tolist()
print(",".join(top_five_users))

yyx990803,halfrost,DIYgod,yangshun,bytedance


In [5]:
# 2. Who are the 5 earliest registered GitHub users in Singapore?
#    Answer format: logins, comma-separated, in ascending order of created_at.
# NOTE(review): if created_at is still a string column when this cell runs, the
# ordering relies on the timestamps being lexicographically sortable
# (e.g. ISO-8601) - confirm against the CSV.
users_by_signup = users_df.sort_values('created_at')
earliest_five_users = users_by_signup['login'].iloc[:5].tolist()
print(",".join(earliest_five_users))

chuyeow,choonkeat,winston,cheeaun,nowa


In [6]:
# 3. What are the 3 most popular licenses among these users? Missing licenses are
#    ignored. Answer format: license_name values, comma-separated, most common first.
repos_with_license = repos_df.dropna(subset=['license_name'])
license_counts = repos_with_license['license_name'].value_counts()
top_3_licenses = license_counts.index[:3].tolist()
print(",".join(top_3_licenses))

mit,apache-2.0,other


In [7]:
# 4. Which company do the majority of these developers work at?
# Only users with a company value are considered. Names are normalised before
# counting: surrounding whitespace removed, leading '@' characters removed, upper-cased.
has_company = users_df['company'].notna()
users_with_company = users_df[has_company].assign(
    company=lambda frame: frame['company'].str.strip().str.lstrip('@').str.upper()
)
company_counts = users_with_company['company'].value_counts()
most_common_company = company_counts.idxmax()
print(most_common_company)

NATIONAL UNIVERSITY OF SINGAPORE


In [8]:
# 5. Which programming language is most popular among these users?
# Popularity here is the number of repositories per language; repositories
# without a language value are excluded.
repos_with_language = repos_df.dropna(subset=['language'])
language_repo_counts = repos_with_language['language'].value_counts()
most_common_language = language_repo_counts.idxmax()
print(most_common_language)

JavaScript


In [9]:
# 6. Which programming language is the second most popular among users who joined after 2020?
# Parse registration timestamps as UTC-aware datetimes (written back onto users_df).
users_df['created_at'] = pd.to_datetime(users_df['created_at'], utc=True)
# NOTE(review): "after 2020" is implemented as strictly later than
# 2020-01-01T00:00:00Z - confirm this is the intended cutoff.
comparison_date = pd.Timestamp('2020-01-01', tz='UTC')
users_after_2020 = users_df[users_df['created_at'] > comparison_date]

# Repositories that have a language and belong to one of those users.
has_language = repos_df['language'].notnull()
owned_by_recent_user = repos_df['login'].isin(users_after_2020['login'])
language_counts = repos_df.loc[has_language & owned_by_recent_user, 'language'].value_counts()

# value_counts() is sorted by descending count, so position 1 is the runner-up.
# Taking nlargest(2).idxmin() instead would return the *top* language whenever the
# two leading counts are tied (idxmin picks the first minimum), and would silently
# return the only language when fewer than two exist.
second_most_common_language = language_counts.index[1] if len(language_counts) > 1 else None
print(second_most_common_language)

Python


In [10]:
# 7. Which language has the highest average number of stars per repository?
# Note: avg_stars_per_language ends up holding the winning language's name,
# not the table of averages.
mean_stars_by_language = repos_df.groupby('language')['stargazers_count'].mean()
avg_stars_per_language = mean_stars_by_language.idxmax()
print(avg_stars_per_language)

Inno Setup


In [11]:
# 8. leader_strength is defined as followers / (1 + following).
#    Who are the top 5 by leader_strength? Answer format: logins, comma-separated,
#    strongest first.
# The ratio is stored as a new 'leader_strength' column on users_df.
users_df['leader_strength'] = users_df['followers'].div(users_df['following'].add(1))
users_by_strength = users_df.sort_values('leader_strength', ascending=False)
top_5_leader_strength = users_by_strength['login'].head(5).tolist()
print(",".join(top_5_leader_strength))

bytedance,Jinjiang,cloudflare,Shib-Chain,rustdesk


In [12]:
# 9. What is the correlation between the number of followers and the number of
#    public repositories among users in Singapore?
# Pearson correlation (the pandas default), reported to 3 decimal places.
follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
correlation = follower_counts.corr(public_repo_counts, method='pearson')
print(round(correlation, 3))

0.046


In [13]:
# 10. Does creating more repos help users get more followers?
#     Ordinary least squares of followers on public_repos; the answer is the
#     estimated number of additional followers per additional public repository.
y = users_df['followers']
# add_constant puts the intercept column alongside the single predictor.
X = sm.add_constant(users_df['public_repos'])
model = sm.OLS(y, X).fit()
# The fitted parameter at position 1 is taken as the public_repos coefficient.
slope = model.params.iloc[1]
print(round(slope, 3))

1.448


In [14]:
# 11. Do people typically enable projects and wikis together?
#     Correlation between a repo having projects enabled and having wiki enabled.
# Both flags are cast to integers before correlating.
projects_enabled = repos_df['has_projects'].astype(int)
wiki_enabled = repos_df['has_wiki'].astype(int)
correlation_projects_wiki = projects_enabled.corr(wiki_enabled)
print(round(correlation_projects_wiki, 3))

0.304


In [20]:
# 12. Do hireable users follow more people than those who are not hireable?
#     Reported value: mean(following | hireable) - mean(following | everyone else).
# Any row whose 'hireable' value is not True (including missing values) falls
# into the second group.
is_hireable = users_df['hireable'].eq(True)
is_not_hireable = users_df['hireable'].ne(True)
average_following_hireable = users_df.loc[is_hireable, 'following'].mean()
average_following_non_hireable = users_df.loc[is_not_hireable, 'following'].mean()
following_difference = average_following_hireable - average_following_non_hireable
print(f"{following_difference:.3f}")

221.167


In [16]:
# 13. Some developers write long bios. Does that help them get more followers?
# Bio length is measured in characters and stored as a new 'bio_length' column;
# the correlation with followers is computed over users that have a bio.
users_df['bio_length'] = users_df['bio'].str.len()
users_with_bio = users_df.dropna(subset=['bio'])
bio_length_correlation = users_with_bio['bio_length'].corr(users_with_bio['followers'])
print(f"{bio_length_correlation:.3f}")

0.050


In [17]:
# 14. Who created the most repositories on weekends (UTC)?
#     Answer format: top 5 users' logins, comma-separated, most weekend repos first.
# Parse with utc=True so every timestamp is normalised to UTC before the weekday
# is taken (the question is defined in UTC); this also matches how the user
# table's created_at is parsed. The parsed values replace repos_df['created_at'].
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
repos_df['created_on_weekend'] = repos_df['created_at'].dt.dayofweek >= 5
weekend_repos = repos_df[repos_df['created_on_weekend']]
top_users_weekend = weekend_repos['login'].value_counts().head(5).index.tolist()
print(','.join(top_users_weekend))

alextanhongpin,SOF3,shantanu561993,KennyDizi,vdt


In [21]:
# 15. Do people who are hireable share their email addresses more often?
#     Reported value: share of hireable users with an email minus the share among
#     everyone else.
# Any row whose 'hireable' value is not True (including missing values) counts
# as non-hireable.
has_email = users_df['email'].notna()
fraction_email_hireable = has_email[users_df['hireable'].eq(True)].mean()
fraction_email_non_hireable = has_email[users_df['hireable'].ne(True)].mean()
email_fraction_difference = fraction_email_hireable - fraction_email_non_hireable
print(f"{email_fraction_difference:.3f}")

0.074


In [19]:
# 16. Let's assume that the last word in a user's name is their surname (ignore
#     missing names, trim and split by whitespace). What's the most common surname?
#     (If there's a tie, list them all, comma-separated, alphabetically.)
# str.split() with no separator splits on runs of whitespace and ignores leading
# and trailing whitespace; missing names stay missing and are not counted.
users_df['surname'] = users_df['name'].str.split().str[-1]
common_surnames = users_df['surname'].value_counts()
most_common_surname_count = common_surnames.max()
# Collect every surname that reaches the maximum count so ties are all reported,
# in alphabetical order, rather than only the first one idxmax() would return.
tied_surnames = sorted(common_surnames[common_surnames == most_common_surname_count].index)
most_common_surname = ",".join(tied_surnames)
print(f"Ans 16 : Most common surname: {most_common_surname} with {most_common_surname_count} users")

Ans 16 : Most common surname: Wang with 14 users
