In [1]:
import requests
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
headers = {'Authorization': f'token {token}'}
url = 'https://api.github.com/search/users'
params = {'q': 'location:Mumbai followers:>50', 'per_page': 100}
users_list = []

for page in range(1, 8):  
    params['page'] = page
    response = requests.get(url, headers=headers, params=params)
    users_data = response.json().get('items', [])
    
    for user in users_data:
        user_url = f"https://api.github.com/users/{user['login']}"
        user_response = requests.get(user_url, headers=headers)
        user_info = user_response.json()
        company = user_info.get('company', '')
        if company:
            company = company.strip().lstrip('@').upper()
        users_list.append({
            'login': user_info['login'],
            'name': user_info.get('name', ''),
            'company': company,
            'location': user_info.get('location', ''),
            'email': user_info.get('email', ''),
            'hireable': user_info.get('hireable', False),
            'bio': user_info.get('bio', ''),
            'public_repos': user_info['public_repos'],
            'followers': user_info['followers'],
            'following': user_info['following'],
            'created_at': user_info['created_at'],
        })

    if len(users_data) < 100:
        break

users_df = pd.DataFrame(users_list)
users_df.to_csv('users.csv', index=False)

In [44]:
repositories_list = []
for login in users_df['login']:
    repos_url = f"https://api.github.com/users/{login}/repos"
    params = {'per_page': 100}  
    for page in range(1, 6):
        params['page'] = page
        response = requests.get(repos_url, headers=headers, params=params)
        repos_data = response.json()
        if not repos_data:
            break
        for repo in repos_data:
            repositories_list.append({
                'login': login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

repositories_df = pd.DataFrame(repositories_list)
repositories_df.to_csv('repositories.csv', index=False)

KeyboardInterrupt: 

#### Q1.  R

In [20]:
users_df = pd.read_csv('users.csv')
top_users = users_df.sort_values(by='followers', ascending=False).head(5)
top_users_logins = ', '.join(top_users['login'].tolist())
print(top_users_logins)

ValentineFernandes, kovidgoyal, slidenerd, aryashah2k, coding-parrot


#### Q2.

In [21]:
earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)
earliest_users_logins = ', '.join(earliest_users['login'].tolist())
print(earliest_users_logins)

ivank, sandeepshetty, svs, nitinhayaran, nischal


#### Q3.

In [22]:
repos_df = pd.read_csv('repositories.csv')
repos_with_license = repos_df.dropna(subset=['license_name'])
top_licenses = repos_with_license['license_name'].value_counts().head(3)
top_licenses_names = ', '.join(top_licenses.index.tolist())
print(top_licenses_names)

mit, apache-2.0, other


#### Q4. 

In [23]:
companies = users_df['company'].dropna()
most_common_company = companies.value_counts().idxmax()
print(most_common_company)

MASAI SCHOOL


#### Q5. 

In [24]:
repos_df = pd.read_csv('repositories.csv')
languages = repos_df['language'].dropna()
most_common_language = languages.value_counts().idxmax()
print(most_common_language)

JavaScript


#### Q6.

In [25]:
users_after_2020 = users_df[users_df['created_at'] > '2020-12-31']
merged_df = pd.merge(users_after_2020, repos_df, on='login')
languages = merged_df['language'].dropna()
second_most_common_language = languages.value_counts().index[1]
print(second_most_common_language)

HTML


In [26]:
users_after_2020 = users_df[users_df['created_at'] > '2020-12-31']
merged_df = pd.merge(users_after_2020, repos_df, on='login')
languages = merged_df['language'].dropna()
second_most_common_language = languages.value_counts().index[1]
print("The second most popular programming language is:", second_most_common_language)

The second most popular programming language is: HTML


In [27]:
users_after_2020 = users_df[users_df['created_at'] > '2020-12-31']
merged_df = pd.merge(users_after_2020, repos_df, on='login')
languages = merged_df['language'].dropna()
languages = languages[languages != 'HTML']  # Exclude HTML
second_most_common_language = languages.value_counts().index[1]
print("The second most popular programming language is:", second_most_common_language)

The second most popular programming language is: Python


#### Q7.

In [28]:
repos_with_language = repos_df.dropna(subset=['language', 'stargazers_count'])
average_stars_per_language = repos_with_language.groupby('language')['stargazers_count'].mean()
most_popular_language = average_stars_per_language.idxmax()
print(most_popular_language)

TSQL


In [29]:
import pandas as pd

# Assuming `repos_df` has columns: 'language' and 'stargazers_count'
# Filter out repositories without a specified language or stars count
repos_with_language = repos_df.dropna(subset=['language', 'stargazers_count'])

# Step 1: Define a function to remove outliers using IQR within each language group
def remove_outliers(group):
    Q1 = group['stargazers_count'].quantile(0.25)
    Q3 = group['stargazers_count'].quantile(0.75)
    IQR = Q3 - Q1
    return group[(group['stargazers_count'] >= Q1 - 1.5 * IQR) & (group['stargazers_count'] <= Q3 + 1.5 * IQR)]

# Step 2: Apply the outlier removal function
filtered_repos = repos_with_language.groupby('language').apply(remove_outliers).reset_index(drop=True)

# Step 3: Calculate the average stars for each language after removing outliers
average_stars_per_language = filtered_repos.groupby('language')['stargazers_count'].mean()

# Step 4: Find the language with the highest average stars
highest_avg_star_language = average_stars_per_language.idxmax()
highest_avg_star_value = average_stars_per_language.max()

print(f"The language with the highest average number of stars per repository (after outlier treatment) is {highest_avg_star_language} with an average of {highest_avg_star_value} stars.")


The language with the highest average number of stars per repository (after outlier treatment) is GLSL with an average of 17.0 stars.


#### Q8. 

In [35]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leader_strength = users_df.sort_values(by='leader_strength', ascending=False).head(5)
top_leader_logins = ', '.join(top_leader_strength['login'].tolist())
print(top_leader_logins)

kovidgoyal, coding-parrot, gkcs, slidenerd, dmalvia


#### Q9.

In [30]:
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f"{correlation:.3f}")

0.032


#### Q10.

In [31]:
X = users_df['public_repos'].values.reshape(-1, 1)
y = users_df['followers'].values
model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
print(f"{slope:.3f}")

0.094


In [32]:
repos_df = pd.read_csv('repositories.csv')
repos_df['has_projects'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")

Correlation between projects and wiki enabled: 0.173


#### Q11.

In [38]:
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])
print(f"{correlation:.3f}")

0.173


In [18]:
repos_df = pd.read_csv('repositories.csv')

# Ensure "has_projects" and "has_wiki" are strings, then map them to numeric values
repos_df['has_projects'] = repos_df['has_projects'].astype(str).str.lower().map({'true': 1, 'false': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].astype(str).str.lower().map({'true': 1, 'false': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Display the correlation rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: 0.173


#### Q12.

In [39]:
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
difference = avg_following_hireable - avg_following_non_hireable
print(f"{difference:.3f}")

nan


#### Q13.

In [19]:
users_with_bio = users_df.dropna(subset=['bio'])
users_with_bio['bio_length'] = users_with_bio['bio'].apply(len)
X = users_with_bio['bio_length'].values.reshape(-1, 1)
y = users_with_bio['followers'].values
model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
print(f"{slope:.3f}")

-0.160


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_length'] = users_with_bio['bio'].apply(len)


#### Q14.

In [41]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
weekend_repos = repos_df[repos_df['created_at'].dt.weekday >= 5]
weekend_repos_count = weekend_repos['login'].value_counts().head(5)
top_weekend_creators = ', '.join(weekend_repos_count.index.tolist())
print(top_weekend_creators)

backtrackbaba, devdatta95, ankit0183, Kushal334, burhanuday


#### Q15.

In [42]:
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().mean()
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notna().mean()
difference = hireable_with_email - non_hireable_with_email
print(f"{difference:.3f}")

nan


#### Q16.

In [43]:
users_with_names = users_df.dropna(subset=['name'])
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]
surname_counts = users_with_names['surname'].value_counts()
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()
most_common_surnames_str = ', '.join(sorted(most_common_surnames))
print(most_common_surnames_str)
print(max_count)

Singh
17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]
