In [None]:
import pandas as pd

# Load the raw export; every later frame in this notebook is derived from `df`.
df = pd.read_csv('dump.csv')

# Drop columns the analysis below does not use, then remove exact duplicate rows.
columns_to_drop = ["Unnamed: 0", "companyUrn", "mbrLocationCode", "posLocationCode", "companyUrl"]
df_cleaned = df.drop(columns=columns_to_drop).drop_duplicates()

# creating a new column to determine if presently employed
# This is decided from the RAW endDate column, before the dates are parsed:
# pd.to_datetime(..., errors="coerce") turns unparseable values into NaT, so
# testing isna() after the conversion would treat a garbled end date as
# "no end date" and mislabel that row as currently employed.
df_cleaned["currently_employed"] = df_cleaned["endDate"].isna()

# convert date to datetime format (values that cannot be parsed become NaT)
df_cleaned["startDate"] = pd.to_datetime(df_cleaned["startDate"], errors="coerce")
df_cleaned["endDate"] = pd.to_datetime(df_cleaned["endDate"], errors="coerce")

# remove rows with missing data
# (startDate is checked after parsing, so rows with an unparseable start date go too)
df_cleaned = df_cleaned.dropna(subset=["posTitle", "companyName", "startDate"])

# employed and unemployed groups
is_employed = df_cleaned["currently_employed"]
df_employed = df_cleaned[is_employed]
df_unemployed = df_cleaned[~is_employed]

# percentage of users currently employed
# NOTE(review): this is the share of ROWS without an end date. If each row is
# one position rather than one user (column names such as posTitle suggest
# so), it is not a per-user rate -- confirm against the data dictionary.
employment_rate = is_employed.mean() * 100

# employment rate shown
df_cleaned.info()
print(f"Percentage of users currently employed: {employment_rate:.2f}%")

In [None]:
# Summary statistics for the rows flagged currently_employed == False.
df_unemployed.describe()

In [None]:
# Summary statistics for the rows flagged currently_employed == True.
df_employed.describe()

In [None]:
# Compare the two groups on average age, followers, premium rate, followable
# rate, average position duration, and gender breakdown.
def _unemployed_vs_employed_mean(column):
    """Return [mean over df_unemployed, mean over df_employed] for `column`."""
    return [df_unemployed[column].mean(), df_employed[column].mean()]


comparison_metrics = {
    "Average Age": _unemployed_vs_employed_mean("ageEstimate"),
    "Average Followers": _unemployed_vs_employed_mean("followersCount"),
    "Is Premium Rate": _unemployed_vs_employed_mean("isPremium"),
    "Is Followable Rate": _unemployed_vs_employed_mean("followable"),
    # The underlying column is avgMemberPosDuration, so the metric is labelled
    # as a position duration rather than a skills count.
    "Average Position Duration": _unemployed_vs_employed_mean("avgMemberPosDuration"),
}

# Gender breakdown: one scalar column per observed genderEstimate value, so
# every cell of the table is a number instead of a nested Series. A value
# that is absent from one group gets a share of 0.
gender_shares = pd.DataFrame({
    "Unemployed": df_unemployed["genderEstimate"].value_counts(normalize=True),
    "Employed": df_employed["genderEstimate"].value_counts(normalize=True),
}).fillna(0.0)
for gender, shares in gender_shares.iterrows():
    comparison_metrics[f"Gender Share ({gender})"] = [shares["Unemployed"], shares["Employed"]]

# Rows are the two groups; columns are the metrics above.
comparison_df = pd.DataFrame(comparison_metrics, index=["Unemployed", "Employed"])
print(comparison_df)

In [None]:
# weighted formula (avgMemberPosDuration = experience)
# Hand-picked weight per column; the values sum to 1.0.
weights = {
    "ageEstimate": 0.1,
    "followersCount": 0.2,
    "isPremium": 0.15,
    "followable": 0.1,
    "avgMemberPosDuration": 0.45
}

def calculate_weighted_score(df, feature_weights=None):
    """Return the per-row weighted sum of the weighted columns of `df`.

    Missing values in a column are treated as 0 before weighting. No scaling
    is applied, so each column contributes in its own units.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the weight mapping.
    feature_weights : dict, optional
        Mapping of column name -> weight. Defaults to the module-level
        `weights`, which reproduces the original five-term formula.

    Returns
    -------
    pandas.Series
        One score per row, aligned to `df.index` (all zeros when the weight
        mapping is empty).

    Raises
    ------
    KeyError
        If a weighted column is missing from `df`.
    """
    active_weights = weights if feature_weights is None else feature_weights

    # Driving the sum from the mapping keeps the formula and the weight table
    # in sync: adding or removing a weight changes the score without touching
    # this function.
    score = pd.Series(0.0, index=df.index)
    for column, weight in active_weights.items():
        score = score + df[column].fillna(0) * weight
    return score

# Attach the weighted score to every cleaned row.
df_cleaned["weighted_score"] = calculate_weighted_score(df_cleaned)

# Mean weighted score for each value of currently_employed.
df_weighted_scores = (
    df_cleaned["weighted_score"]
    .groupby(df_cleaned["currently_employed"])
    .mean()
)
print(df_weighted_scores)

In [None]:
# Display the per-group mean weighted score (a Series indexed by currently_employed).
df_weighted_scores.head()

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns


# Explanatory variables: the same columns that feed weighted_score.
# Missing values are filled with 0, matching how weighted_score itself was
# computed; LinearRegression.fit raises on NaN input, so the raw columns
# cannot be passed through when any value is missing. The cast to float gives
# scikit-learn a purely numeric matrix even for boolean flag columns.
X = df_cleaned[['ageEstimate', 'followersCount', 'isPremium', 'followable', 'avgMemberPosDuration']].fillna(0).astype(float)
y = df_cleaned['weighted_score']

# linreg
# NOTE(review): y is by construction an exact linear combination of these
# same columns, so the fit can only hand back the hand-picked weights with
# R-squared ~= 1. If the goal is to learn what separates the two groups, the
# target should presumably be currently_employed -- confirm.
model = LinearRegression()
model.fit(X, y)

# r-squared/coef
coefficients = model.coef_
intercept = model.intercept_
r_squared = model.score(X, y)  # in-sample R^2 on the training data

# results
print("Regression Results:")
print(f"Intercept: {intercept}")
feature_names = X.columns
for feature, coefficient in zip(feature_names, coefficients):
    print(f"{feature}: {coefficient}")
print(f"R-squared: {r_squared}")


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns



# Relative weights: each coefficient's share of the total absolute coefficient size.
abs_coefficients = np.abs(coefficients)
relative_importance = abs_coefficients / abs_coefficients.sum()

print("\nRelative Importance of Variables:")
for feature, share in zip(feature_names, relative_importance):
    print(f"{feature}: {share * 100:.2f}%")

# Bar chart of the shares, largest first.
plt.figure(figsize=(12, 6))
sns.set_style("whitegrid")
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': relative_importance * 100
})
importance_df = importance_df.sort_values('Importance', ascending=False)

ax = sns.barplot(data=importance_df, x='Feature', y='Importance')
ax.set_xlabel('Features')
ax.set_ylabel('Relative Importance (%)')
ax.set_title('Feature Importance in Weighted Score Formula')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns



# Reuse the weight table that produced weighted_score instead of re-typing
# it, so this comparison cannot drift from the formula actually applied.
original_weights = dict(weights)

# arbitrary weights vs. comparison
# Read the weights in feature_names order so every column of the table below
# is aligned with the 'Feature' column regardless of dict ordering.
original_weights_values = np.array([original_weights[feat] for feat in feature_names])
normalized_original = original_weights_values / np.sum(original_weights_values)

# Create comparison dataframe
# NOTE: this rebinds comparison_df, replacing the group-comparison table an
# earlier cell stored under the same name.
comparison_df = pd.DataFrame({
    'Feature': feature_names,
    'Original Weight (%)': original_weights_values * 100,
    'Regression Weight (%)': coefficients * 100,
    'Original Normalized (%)': normalized_original * 100,
    'Regression Importance (%)': relative_importance * 100
})

print("\nComparison of Original Weights vs. Regression Coefficients:")
print(comparison_df)

# Pairwise correlation between the regression inputs, drawn as an annotated heatmap.
corr_matrix = X.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Explanatory Variables')
corr_fig.tight_layout()
plt.show()

# Graph 2: residuals (actual minus fitted) against the fitted values.
y_pred = model.predict(X)
residuals = y - y_pred

resid_fig, resid_ax = plt.subplots(figsize=(10, 6))
resid_ax.scatter(y_pred, residuals)
resid_ax.axhline(y=0, color='r', linestyle='-')  # zero-residual reference line
resid_ax.set_xlabel('Predicted Values')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title('Residual Plot')
resid_fig.tight_layout()
plt.show()

In [None]:
# TODO: Do a correlation matrix.
# TODO: Does the data have to be normally distributed?