In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv('/kaggle/input/testing/final_output_processed.csv')

# Select relevant columns
df = df[['salary', 'orgTags_SKILLS', 'position_name','position_workType']]

# Drop rows with missing values
df = df.dropna()

# Remove rows where 'orgTags_SKILLS' or 'salary' is 'NOT FOUND'
df = df[(df['orgTags_SKILLS'] != 'NOT FOUND') & (df['salary'] != 'NOT FOUND')]
df = df[(df['position_workType'] != 'NOT FOUND')]

# Remove rows where 'salary' contains letters
df = df[df['salary'].str.contains('[a-zA-Z]')==False]

# Convert 'salary' to float and then integer
df['salary'] = df['salary'].astype(float).astype(int)

# Remove rows where 'salary' is 0 or greater than 20000
df = df[(df['salary'] != 0) & (df['salary'] < 20000)]

# Combine 'orgTags_SKILLS' and 'position_name' into a single text representation
df['combined_text'] = df['orgTags_SKILLS'] + ' ' + df['position_name']+ ' ' + df['position_workType']

# TF-IDF vectorization on the combined text
tfidf_vectorizer = TfidfVectorizer()
combined_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Split the data
X = combined_matrix
y = df['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the test data
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


  df = pd.read_csv('/kaggle/input/testing/final_output_processed.csv')


Mean Squared Error: 7510424.480354067


In [44]:
def predict_salary(input_data):
    # Preprocess the input data
    combined_text = input_data['orgTags_SKILLS'] + ' ' + input_data['position_name'] + ' ' + input_data['position_workType']

    # TF-IDF vectorization
    input_matrix = tfidf_vectorizer.transform([combined_text])

    # Predict salary
    predicted_salary = model.predict(input_matrix)

    return predicted_salary[0]


# Example input data
input_data = {
    'orgTags_SKILLS': 'python,java,ai',
    'position_name': 'data scientist',
    'position_workType': 'internship'
}

# Predict salary
predicted_salary = predict_salary(input_data)
print("Predicted Salary:", predicted_salary)


Predicted Salary: 9385.196726492151


In [47]:
import pickle

model_file_path = 'linear_regression_tfidf.pkl'

# Pickle the model
with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)

print("Model pickled successfully!")

Model pickled successfully!
