In [26]:
import pandas as pd

df = pd.read_csv('dump.csv')

df.head()

columns_to_drop = ["Unnamed: 0", "companyUrn", "mbrLocationCode", "posLocationCode", "companyUrl"]
df_cleaned = df.drop(columns=columns_to_drop)
df_cleaned = df_cleaned.drop_duplicates()

# convert date to datetime format
df_cleaned["startDate"] = pd.to_datetime(df_cleaned["startDate"], errors="coerce")
df_cleaned["endDate"] = pd.to_datetime(df_cleaned["endDate"], errors="coerce")

# creating a new column to determine if presently employed
df_cleaned["currently_employed"] = df_cleaned["endDate"].isna()

# remove rows with missing data
df_cleaned = df_cleaned.dropna(subset=["posTitle", "companyName", "startDate"])

# employed and unemployed groups
df_employed = df_cleaned[df_cleaned["currently_employed"] == True]
df_unemployed = df_cleaned[df_cleaned["currently_employed"] == False]

# percentage of users currently employed
employment_rate = df_cleaned["currently_employed"].mean() * 100

# employment rate shown
df_cleaned.info()
print(f"Percentage of users currently employed: {employment_rate:.2f}%")

<class 'pandas.core.frame.DataFrame'>
Index: 39519 entries, 0 to 39536
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ageEstimate            39519 non-null  float64       
 1   companyFollowerCount   38056 non-null  float64       
 2   companyHasLogo         36393 non-null  object        
 3   companyName            39519 non-null  object        
 4   companyStaffCount      38385 non-null  float64       
 5   connectionsCount       39519 non-null  float64       
 6   country                39519 non-null  object        
 7   endDate                30625 non-null  datetime64[ns]
 8   followable             39519 non-null  float64       
 9   followersCount         39519 non-null  float64       
 10  genderEstimate         35551 non-null  object        
 11  hasPicture             29592 non-null  object        
 12  isPremium              39519 non-null  float64       
 13  mbrLoc

In [27]:
df_unemployed.describe()

Unnamed: 0,ageEstimate,companyFollowerCount,companyStaffCount,connectionsCount,endDate,followable,followersCount,isPremium,positionId,startDate,avgMemberPosDuration,avgCompanyPosDuration
count,30625.0,29502.0,29756.0,30625.0,30625,30625.0,30625.0,30625.0,30625.0,30625,30625.0,30625.0
mean,38.528229,229889.4,22558.058173,427.428473,2011-07-20 22:27:41.936326656,0.947298,1165.716343,0.129796,1239144.0,2009-04-09 07:34:01.743673344,836.800614,902.24012
min,20.0,0.0,0.0,0.0,1970-02-01 00:00:00,0.0,0.0,0.0,23.0,1962-01-01 00:00:00,0.0,-91.0
25%,31.0,1963.0,133.0,381.0,2008-09-01 00:00:00,1.0,362.0,0.0,169794.0,2006-01-01 00:00:00,494.25,747.2308
50%,38.0,17420.0,1608.0,500.0,2012-12-01 00:00:00,1.0,657.0,0.0,239691.0,2010-10-01 00:00:00,714.9,912.7557
75%,45.0,148934.0,10735.0,500.0,2015-11-01 00:00:00,1.0,1180.0,0.0,291306.0,2014-03-01 00:00:00,1034.25,1048.3585
max,86.0,7502740.0,568533.0,500.0,2019-02-01 00:00:00,1.0,161922.0,1.0,38454430.0,2019-02-01 00:00:00,15492.5,6453.0
std,9.56206,712325.6,71647.270173,120.414579,,0.223442,2832.582829,0.336084,5591642.0,,571.234805,320.237393


In [28]:
df_employed.describe()

Unnamed: 0,ageEstimate,companyFollowerCount,companyStaffCount,connectionsCount,endDate,followable,followersCount,isPremium,positionId,startDate,avgMemberPosDuration,avgCompanyPosDuration
count,8894.0,8554.0,8629.0,8894.0,0,8894.0,8894.0,8894.0,8894.0,8894,8673.0,8044.0
mean,38.11727,160260.6,13219.378491,415.092309,NaT,0.960198,1334.689454,0.132449,9015320.0,2014-12-17 00:23:57.733303296,1010.190338,834.90895
min,20.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,22.0,1973-05-01 00:00:00,0.0,-91.0
25%,30.0,509.0,33.0,343.0,NaT,1.0,329.0,0.0,191086.2,2014-01-01 00:00:00,536.375,635.3753
50%,37.0,6946.0,489.0,500.0,NaT,1.0,637.0,0.0,271826.5,2016-05-01 00:00:00,817.7333,858.5587
75%,45.0,85348.0,5797.0,500.0,NaT,1.0,1259.0,0.0,25071170.0,2017-07-01 00:00:00,1218.3333,992.5
max,86.0,7502740.0,568533.0,500.0,NaT,1.0,161922.0,1.0,38454420.0,2019-02-01 00:00:00,10255.0,9497.0
std,9.838484,625490.8,50358.593669,132.080548,,0.195505,3423.515015,0.338997,14457350.0,,823.713043,364.603051


In [29]:
# Compare average age, followers, gender breakdown, isPremium rate, isFollowable, and skills
comparison_metrics = {
    "Average Age": [df_unemployed["ageEstimate"].mean(), df_employed["ageEstimate"].mean()],
    "Average Followers": [df_unemployed["followersCount"].mean(), df_employed["followersCount"].mean()],
    "Gender Breakdown": [df_unemployed["genderEstimate"].value_counts(normalize=True), df_employed["genderEstimate"].value_counts(normalize=True)],
    "Is Premium Rate": [df_unemployed["isPremium"].mean(), df_employed["isPremium"].mean()],
    "Is Followable Rate": [df_unemployed["followable"].mean(), df_employed["followable"].mean()],
    "Average Skills Count": [df_unemployed["avgMemberPosDuration"].mean(), df_employed["avgMemberPosDuration"].mean()]
}

comparison_df = pd.DataFrame(comparison_metrics, index=["Unemployed", "Employed"])
print(comparison_df)

            Average Age  Average Followers  \
Unemployed    38.528229        1165.716343   
Employed      38.117270        1334.689454   

                                             Gender Breakdown  \
Unemployed  genderEstimate
male      0.674161
female    0....   
Employed    genderEstimate
male      0.656344
female    0....   

            Is Premium Rate  Is Followable Rate  Average Skills Count  
Unemployed         0.129796            0.947298            836.800614  
Employed           0.132449            0.960198           1010.190338  


In [40]:
# weighted formula (avgMemberPosDuration = experience)
weights = {
    "ageEstimate": 0.1,
    "followersCount": 0.2,
    "isPremium": 0.15,
    "followable": 0.1,
    "avgMemberPosDuration": 0.45
}

def calculate_weighted_score(df):
    return (
        df["ageEstimate"].fillna(0) * weights["ageEstimate"] +
        df["followersCount"].fillna(0) * weights["followersCount"] +
        df["isPremium"].fillna(0) * weights["isPremium"] +
        df["followable"].fillna(0) * weights["followable"] +
        df["avgMemberPosDuration"].fillna(0) * weights["avgMemberPosDuration"]
    )

df_cleaned["weighted_score"] = calculate_weighted_score(df_cleaned)

# average weighted scores
df_weighted_scores = df_cleaned.groupby("currently_employed")["weighted_score"].mean()
print(df_weighted_scores)

currently_employed
False    613.670567
True     714.155516
Name: weighted_score, dtype: float64


In [38]:
df_weighted_scores.head()

currently_employed
False    613.670567
True     714.155516
Name: weighted_score, dtype: float64

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns


X = df_cleaned[['ageEstimate', 'followersCount', 'isPremium', 'followable', 'avgMemberPosDuration']]
y = df_cleaned['weighted_score']

# linreg
model = LinearRegression()
model.fit(X, y)

# r-squared/coef
coefficients = model.coef_
intercept = model.intercept_
r_squared = model.score(X, y)

# results
print("Regression Results:")
print(f"Intercept: {intercept}")
feature_names = X.columns
for i, feature in enumerate(feature_names):
    print(f"{feature}: {coefficients[i]}")
print(f"R-squared: {r_squared}")

# relative weights
abs_coefficients = np.abs(coefficients)
relative_importance = abs_coefficients / np.sum(abs_coefficients)

print("\nRelative Importance of Variables:")
for i, feature in enumerate(feature_names):
    print(f"{feature}: {relative_importance[i] * 100:.2f}%")

# graphing
plt.figure(figsize=(12, 6))
sns.set_style("whitegrid")
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': relative_importance * 100
}).sort_values('Importance', ascending=False)

ax = sns.barplot(x='Feature', y='Importance', data=importance_df)
plt.xlabel('Features')
plt.ylabel('Relative Importance (%)')
plt.title('Feature Importance in Weighted Score Formula')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

original_weights = {
    "ageEstimate": 0.1,
    "followersCount": 0.2,
    "isPremium": 0.15,
    "followable": 0.1,
    "avgMemberPosDuration": 0.45
}

# arbitrary weights vs. comparison
original_weights_values = np.array(list(original_weights.values()))
normalized_original = original_weights_values / np.sum(original_weights_values)

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Feature': feature_names,
    'Original Weight (%)': [original_weights[feat] * 100 for feat in feature_names],
    'Regression Weight (%)': coefficients * 100,
    'Original Normalized (%)': normalized_original * 100,
    'Regression Importance (%)': relative_importance * 100
})

print("\nComparison of Original Weights vs. Regression Coefficients:")
print(comparison_df)

# correlation
corr_matrix = X.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Explanatory Variables')
plt.tight_layout()
plt.show()

# graph 2
y_pred = model.predict(X)
residuals = y - y_pred

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.show()

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Do a correlation matrix
# Does the data have to be normally distributed