# String Indexer

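- `StringIndexer` in PySpark is comparable to `OrdinalEncoder` / `LabelEncoder` in scikit-learn (not `OneHotEncoder`): it replaces each category with a single numeric index rather than a set of binary columns.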

### <pre>1. Starting the PySpark Session

In [5]:
import warnings 
warnings.filterwarnings('ignore')

In [6]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session registered under the application name 'tutorial_5'.
spark = (
    SparkSession.builder
    .appName('tutorial_5')
    .getOrCreate()
)

In [7]:
# Echo the session object as this cell's output to confirm it was created.
spark

### <pre>2. Loading the Data

In [10]:
import seaborn as sns

# Fetch seaborn's sample 'tips' dataset; it arrives as a pandas DataFrame.
df = sns.load_dataset('tips')

# Hand the pandas frame to the active session to obtain the Spark DataFrame
# used throughout the rest of the notebook.
tips = spark.createDataFrame(data=df)

In [12]:
# Preview the first five rows of the Spark DataFrame.
tips.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### <pre>3. String Indexing - Converting categorical data into numeric indices

In [14]:
# Print each column's name, type and nullability to spot the string-typed (categorical) columns.
tips.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [15]:
# Collect the names of the categorical columns, i.e. those whose Spark type is exactly 'string'.
# An equality test is used rather than a substring test so that composite types whose
# type string merely contains 'string' (e.g. 'array<string>') are not picked up by mistake.
catg_cols = [col_name for col_name, data_type in tips.dtypes if data_type == 'string']

In [16]:
# Show the collected categorical column names.
catg_cols

['sex', 'smoker', 'day', 'time']

In [19]:
from pyspark.ml.feature import StringIndexer

# A single indexer handles every categorical column at once; each result is
# written to a new column named '<original>_encoded'.
index = StringIndexer(
    inputCols=catg_cols,
    outputCols=[name + '_encoded' for name in catg_cols],
)

# fit() learns the label-to-index mapping from `tips`; transform() then appends
# the indexed columns alongside the original ones.
new_tips = index.fit(tips).transform(tips)

In [20]:
# Preview five rows: the original columns plus the newly added '*_encoded' columns.
new_tips.show(n=5)

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_encoded|smoker_encoded|day_encoded|time_encoded|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
only showing top 5 rows



In [21]:
# Drop the original string columns so only their '*_encoded' numeric counterparts remain.
new_tips = new_tips.drop(*catg_cols)

In [22]:
# Preview five rows of the final frame, which now holds numeric columns only.
new_tips.show(n=5)

+----------+----+----+-----------+--------------+-----------+------------+
|total_bill| tip|size|sex_encoded|smoker_encoded|day_encoded|time_encoded|
+----------+----+----+-----------+--------------+-----------+------------+
|     16.99|1.01|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|   4|        1.0|           0.0|        1.0|         0.0|
+----------+----+----+-----------+--------------+-----------+------------+
only showing top 5 rows

