# This data set gives average masses for women as a function of their height in a sample of American women of age 30–39. So I will use a simple linear regression model to see if I can predict the weight of a woman given her height.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Show which files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# The data file is in the input folder, so let's read it now.

In [None]:
# Load the dataset into a DataFrame and preview its first five rows
# (five is the default for head()).
df = pd.read_csv("../input/data.csv")
df.head()

# Let's check for null values.

In [None]:
# Count missing values per column.
df.isna().sum()

# No nulls! This likely means this is a simple dataset created for practice exercises. Let's take a quick look at a plot of the data.

In [None]:
# Scatter plot of weight against height. Labels are set first and the
# scatter call is kept last so it remains the cell's displayed value.
plt.ylabel("Weight")
plt.xlabel("Height")
plt.title("American Women")
plt.scatter(df['Height'], df['Weight'], color='blue')

# Split the data into an x dataframe with the height (the feature) and a y series with the weight, which will be our target.

In [None]:
# Double brackets keep the feature as a 2-D DataFrame; the target is a 1-D Series.
x = df[['Height']]
y = df['Weight']

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and any score computed from it, is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

print('x_train shape:', x_train.shape)
print('y_train shape', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape', y_test.shape)

# Sanity-check the split proportions (printed as fractions of the total row count).
n_total = x_train.shape[0] + x_test.shape[0]
print('percent in x_train:', x_train.shape[0]/n_total)
print('percent in x_test:', x_test.shape[0]/n_total)

# Good split confirmed. Now let's fit a linear regression model.

In [None]:
# Fit a linear regression of weight on height using the training split.
# fit() stays the last expression so the notebook still displays the estimator.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

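# And check how well our model fits the held-out test data (`score` returns the R² coefficient of determination, not a classification accuracy).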

In [None]:
# Evaluate on the held-out rows; for a regressor, score() is the R² coefficient of determination.
model.score(X=x_test, y=y_test)

# A score of about 0.97 (R², i.e. roughly 97% of the variance in the test weights explained) is excellent. Let's see the model in action.

In [None]:
# Look up the row at position 8 (all columns) to compare against a prediction.
df.iloc[8, :]

In [None]:
# Predict the weight for a height of 1.68. The model was fitted on a DataFrame
# with a 'Height' column, so the query is passed as a DataFrame with the same
# column name; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
sample = pd.DataFrame({'Height': [1.68]})
print(model.predict(sample))

# Very close!