In [35]:
# 라이브러리
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np  
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.colors import ListedColormap

In [2]:
# data load
df = pd.read_csv('./dataset/global_tech_salary.txt', delimiter=',')

In [3]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,MI,FT,Data Analyst,165000,USD,165000,AU,0,AU,M
1,2023,MI,FT,Data Analyst,70000,USD,70000,US,100,US,M
2,2024,MI,FT,Machine Learning Engineer,85000,EUR,94444,IE,100,IE,M
3,2024,SE,FT,Data Scientist,92700,USD,92700,US,0,US,M
4,2023,MI,FT,Research Engineer,150000,USD,150000,US,0,US,M


In [4]:
print(df.columns)

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')


In [5]:
# 중복값 제거
df = df.drop_duplicates()

# 중복값 제거 후 데이터 크기 확인
print(f"중복값 제거 후 데이터 크기: {df.shape}")

중복값 제거 후 데이터 크기: (3856, 11)


In [6]:
# 원본 데이터에서 'salary_currency', 'salary 제거하고 새로운 데이터 사용
df = df.drop(columns=['salary_currency', 'salary'])

In [7]:
# 1. Label Encoding
label_encoders = {}
for column in ['experience_level', 'company_size']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # 인코더 저장

# 2. One-Hot Encoding
one_hot_columns = ['employment_type', 'job_title', 'employee_residence', 'company_location']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

# 출력 확인
print(df)

      work_year  experience_level  salary_in_usd  remote_ratio  company_size  \
0          2023                 2         165000             0             1   
1          2023                 2          70000           100             1   
2          2024                 2          94444           100             1   
3          2024                 3          92700             0             1   
4          2023                 2         150000             0             1   
...         ...               ...            ...           ...           ...   
4993       2023                 3         152000             0             1   
4994       2024                 2         151000             0             1   
4995       2023                 3          75577           100             1   
4997       2024                 3         153000             0             1   
4998       2022                 0          31520           100             1   

      employment_type_FL  employment_ty

In [8]:
df.describe()

Unnamed: 0,work_year,experience_level,salary_in_usd,remote_ratio,company_size
count,3856.0,3856.0,3856.0,3856.0,3856.0
mean,2023.149637,2.400415,147213.519969,34.725104,0.953579
std,0.741342,0.913618,69216.650809,47.026522,0.276485
min,2020.0,0.0,15000.0,0.0,0.0
25%,2023.0,2.0,98968.25,0.0,1.0
50%,2023.0,3.0,140000.0,0.0,1.0
75%,2024.0,3.0,185800.0,100.0,1.0
max,2024.0,3.0,750000.0,100.0,2.0


### LinearRegression

In [10]:
# 목표 변수와 설명 변수 분리
X = df.drop(columns=['salary_in_usd'])
y = df['salary_in_usd']

In [11]:
# 데이터셋 분할 (훈련 세트 80%, 테스트 세트 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# 선형 회귀 모델 초기화
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

In [13]:
# 예측
y_pred = model.predict(X_test)

In [14]:
# 평가
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")  # MSE 값이 낮을 수록 좋은 모델
print(f"R-squared: {r2}") # R-squared (R²) : 1에 가까울 수록 좋은 모델

Mean Squared Error: 2779163776.737675
R-squared: 0.31536472623724043


### 로지스틱 회귀

In [17]:
numeric_df = df.select_dtypes(include=['number'])

In [18]:
X = numeric_df.iloc[:,:-1].values
y = numeric_df.iloc[:, -1].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train) # 훈련

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# 정확도 반환
classifier.score(X_test, y_test)

0.9132124352331606

In [31]:
# 시각화할 x, y 값의 범위 설정
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

In [32]:
# meshgrid로 범위 설정
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))


<span style="color:red">feature의 개수가 2개여야하는데 여기서는 특성 개수가 237개임;;</span>

In [38]:
# 각 점에서의 예측값 계산
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)



ValueError: X has 2 features, but LinearRegression is expecting 237 features as input.

In [36]:
# 시각화
cmap_background = ListedColormap(['#FFAAAA', '#AAAAFF', '#AAFFAA'])
cmap_points = ListedColormap(['red', 'blue', 'green'])

In [37]:
plt.contourf(xx, yy, Z, cmap=cmap_background, alpha=0.3)  # 결정 경계
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_points, edgecolors='k')  # 데이터 포인트
plt.title("Logistic Regression Decision Boundary")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

NameError: name 'Z' is not defined

In [26]:
X_range = np.arange(min(X[:, 0]), max(X[:, 0]), 0.1)
X_range         # 2차원

array([2020. , 2020.1, 2020.2, 2020.3, 2020.4, 2020.5, 2020.6, 2020.7,
       2020.8, 2020.9, 2021. , 2021.1, 2021.2, 2021.3, 2021.4, 2021.5,
       2021.6, 2021.7, 2021.8, 2021.9, 2022. , 2022.1, 2022.2, 2022.3,
       2022.4, 2022.5, 2022.6, 2022.7, 2022.8, 2022.9, 2023. , 2023.1,
       2023.2, 2023.3, 2023.4, 2023.5, 2023.6, 2023.7, 2023.8, 2023.9])