In [1]:
# Example: Linear Regression with Spark MLlib
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Initialize Spark Session
spark = SparkSession.builder.appName('MLlib Example').getOrCreate()

# Load sample data
data = [(1, 5.0, 20.0), (2, 10.0, 25.0), (3, 15.0, 30.0), (4, 20.0, 35.0)]
columns = ['ID', 'Feature', 'Target']
df = spark.createDataFrame(data, columns)

# Prepare data for modeling
assembler = VectorAssembler(inputCols=['Feature'], outputCol='Features')
df_transformed = assembler.transform(df)

# Train a linear regression model
lr = LinearRegression(featuresCol='Features', labelCol='Target')
model = lr.fit(df_transformed)

# Print model coefficients
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 17:07:53 WARN Utils: Your hostname, rasyad-VirtualBox, resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/12/07 17:07:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 17:08:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 17:08:13 WARN Instrumentation: [cc30080e] regParam is zero, which might cause numerical instability and overfitting.
25/12/07 17:08:15 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/12/07 17:08:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netl

Coefficients: [0.9999999999999992]
Intercept: 15.000000000000009


In [3]:
# Practice: Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('Logistic Reggression').getOrCreate()

# Example dataset
data = [(1, [2.0, 3.0], 0), (2, [1.0, 5.0], 1), (3, [2.5, 4.5], 1), (4, [3.0, 6.0], 0)]
columns = ['ID', 'Features', 'Label']
df = spark.createDataFrame(data, columns)

# Instead of using 'Features' directly, we need to access the elements within the array
# Create new columns for 'Features[0]' and 'Features[1]' using Spark functions
df = df.withColumn('Feature0', col('Features').getItem(0)) \
       .withColumn('Feature1', col('Features').getItem(1))

# Now use VectorAssembler with the new columns
assembler = VectorAssembler(inputCols=['Feature0', 'Feature1'], outputCol='FeaturesVector')
df = assembler.transform(df)

# Train logistic regression model using the 'FeaturesVector' column
lr = LogisticRegression(featuresCol='FeaturesVector', labelCol='Label')
model = lr.fit(df)

# Display coefficients and summary
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')

25/12/07 17:10:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Coefficients: [-12.262057929987803,4.087352266753044]
Intercept: 11.568912727474402


In [4]:
# Practice: KMeans Clustering
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Example dataset
data = [(1, [1.0, 1.0]), (2, [5.0, 5.0]), (3, [10.0, 10.0]), (4, [15.0, 15.0])]
columns = ['ID', 'Features']
df = spark.createDataFrame(data, columns)

from pyspark.sql.functions import col
df = df.withColumn('Feature1', col('Features').getItem(0))

assembler = VectorAssembler(inputCols=['Feature1'], outputCol='Features_vec')
df = assembler.transform(df)

# Train KMeans clustering model
kmeans = KMeans(featuresCol='Features_vec', k=2)
model = kmeans.fit(df)

# Show cluster centers
centers = model.clusterCenters()
print(f'Cluster Centers: {centers}')

Cluster Centers: [array([5.33333333]), array([15.])]


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier # Algoritma Klasifikasi yang Lebih Kuat
from sklearn.metrics import accuracy_score, classification_report
import warnings

df = pd.read_csv("gta_cars.csv")

df['Price_Clean_Str'] = df['Price'].astype(str).str.split('#').str[0]
df['Price_Clean_Str'] = df['Price_Clean_Str'].str.replace('[^0-9,]', '', regex=True)
df['Price_Clean_Str'] = df['Price_Clean_Str'].str.replace(',', '')
df['Price_Numeric'] = df['Price_Clean_Str'].replace('', np.nan).astype(float)
df['Capacity_Numeric'] = df['Capacity'].astype(str).str.extract(r'(\d+)').astype(float)

# Ganti nama kolom kelas kendaraan
df.rename(columns={'Vehicle class(GTA V/GTA Online)': 'Vehicle_Class'}, inplace=True)

# Menjatuhkan baris dengan nilai hilang (juga di 'Body style')
df_clean = df.dropna(subset=['Price_Numeric', 'Capacity_Numeric', 'Vehicle_Class', 'Body style']).copy()

# Encoding Variabel Target (y)
le = LabelEncoder()
df_clean['label'] = le.fit_transform(df_clean['Vehicle_Class'])

# Tentukan Fitur BARU (X) dan Target (y)
feature_cols = ['Price_Numeric', 'Capacity_Numeric', 'Body style'] # Tambah Body style
X = df_clean[feature_cols]
y = df_clean['label']
target_names = le.classes_

# Split data (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build Pipeline dengan Feature Engineering Lanjutan 

numeric_features = ['Price_Numeric', 'Capacity_Numeric']
categorical_features = ['Body style'] # Fitur baru

#  Standardisasi untuk numerik, One-Hot Encoding untuk kategorikal
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # One-Hot Encoding untuk Body style
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Pipeline: Preprocessing -> Random Forest Classifier (Lebih Kuat)
# RandomForest tidak memerlukan regulasi (C atau penalty) seperti Logistic Regression
rf = RandomForestClassifier(random_state=42)
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', rf)])

param_grid_rf = {
    'classifier__n_estimators': [50, 100], # Jumlah trees
    'classifier__max_depth': [5, 10] # Kedalaman 
}

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    cv_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
    cv_model_rf = cv_rf.fit(X_train, y_train)

# Evaluasi Model Random Forest
y_pred_rf = cv_model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Menangani masalah mismatch kelas
unique_labels_rf = np.unique(np.concatenate([y_test, y_pred_rf]))
filtered_target_names_rf = [target_names[i] for i in unique_labels_rf]

print("Hasil Model Random Forest")
print(f"Test Set Accuracy: {accuracy_rf:.4f}")
print("\nModel Parameter Terbaik:")
print(f"  - n_estimators: {cv_model_rf.best_params_['classifier__n_estimators']}")
print(f"  - max_depth: {cv_model_rf.best_params_['classifier__max_depth']}")

print("\nClassification Report (Accuracy per class):")
print(classification_report(y_test, y_pred_rf, labels=unique_labels_rf, target_names=filtered_target_names_rf, zero_division=0))

Hasil Model Random Forest
Test Set Accuracy: 0.4583

Model Parameter Terbaik:
  - n_estimators: 100
  - max_depth: 10

Classification Report (Accuracy per class):
                 precision    recall  f1-score   support

     Commercial       1.00      0.33      0.50         3
       Compacts       0.00      0.00      0.00         3
         Coupes       0.00      0.00      0.00         5
         Cycles       0.00      0.00      0.00         1
      Emergency       0.00      0.00      0.00         2
     Industrial       0.00      0.00      0.00         2
       Military       1.00      0.40      0.57         5
    Motorcycles       0.38      0.89      0.53         9
         Muscle       1.00      0.31      0.48        16
       Off-Road       0.50      0.62      0.55        13
     Open Wheel       0.00      0.00      0.00         1
           SUVs       0.80      0.57      0.67         7
         Sedans       0.56      0.83      0.67         6
        Service       0.00      0.00  