In [None]:
# importing necessary libraries
import numpy as np
import scipy.stats as spstats
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder , OneHotEncoder


%matplotlib inline
# matplotlib: reload the style library, switch to the 'classic' style, then
# override the figure defaults in a single rcParams update
mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams.update({
    'figure.facecolor': (1, 1, 1, 0),
    'figure.figsize': [6.0, 4.0],
    'figure.dpi': 100,
})

In [None]:
# 12 evenly spaced values covering the closed interval [0, 0.5]
np.linspace(start=0, stop=0.5, num=12)

In [None]:
# Load the Pokemon dataset from the working directory and preview the first rows
pokemon_csv = "pokemon.csv"
poke_df = pd.read_csv(pokemon_csv)
poke_df.head(5)

In [None]:
# Summary statistics for the three stat columns
poke_df.loc[:, ["hp", "attack", "defense"]].describe()

In [None]:
# Two-column frame used as input to the polynomial feature expansion
atk_def = poke_df.loc[:, ['attack', 'defense']]

In [None]:
# Degree-2 polynomial expansion of attack/defense without the constant
# bias column; interaction_only is left at its default of False
pf = PolynomialFeatures(degree=2, include_bias=False)
res = pf.fit_transform(atk_def)
res

In [None]:
# Distinct values of the generation column, in order of first appearance
poke_df["generation_id"].unique()

In [None]:
# gen_le = LabelEncoder()
# genre_label = gen_le.fit_transform(vg_df["Genre"])
# vg_df["Genre_label"] = genre_label

In [None]:
# Integer-encode the generation first: the one-hot step below reads the
# 'gen_Label' column and takes its column names from gen_le.classes_, and
# neither is created by any earlier cell visible in this notebook.
# NOTE(review): assumes 'generation_id' is the generation column of
# pokemon.csv (it is the one inspected with pd.unique above) - confirm.
gen_le = LabelEncoder()
poke_df["gen_Label"] = gen_le.fit_transform(poke_df["generation_id"])

# One-hot encode the integer labels; one output column per encoder class
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['gen_Label']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)
# encode legendary status labels using one-hot encoding scheme

In [None]:
# Load the video-game sales data and show rows 1..6 of the descriptive columns
vg_df = pd.read_csv("vgsales.csv", encoding="utf-8")
descriptive_cols = ['Name', 'Platform', 'Year', 'Genre', 'Publisher']
vg_df.loc[:, descriptive_cols].iloc[1:7]

In [None]:
# Distinct genre values, in order of first appearance
genres = vg_df["Genre"].unique()
genres

In [None]:
# Fit a label encoder on the genre column and expose the resulting
# integer-code -> genre-name mapping
gle = LabelEncoder()
gle.fit_transform(vg_df["Genre"])
genre_labels = dict(enumerate(gle.classes_))
genre_labels

In [None]:
import numpy as np
import pandas as pd
import re
import nltk

In [None]:
# Toy corpus: each entry pairs a document with its category label
labelled_documents = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
]

labels = [category for _, category in labelled_documents]

# documents as a NumPy string array
corpus = np.array([text for text, _ in labelled_documents])

In [None]:
# Tabular view of the corpus with the column order fixed at construction
corpus_df = pd.DataFrame(
    {'Document': corpus, "Category": labels},
    columns=["Document", "Category"],
)
corpus_df

In [None]:
import nltk
# Download the NLTK 'stopwords' resource; the normalization cell below
# reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Shared tokenizer and English stop-word list used by normalize_document
stop_words = nltk.corpus.stopwords.words("english")
wpt = nltk.WordPunctTokenizer()
def normalize_document(doc):
    """Normalize one document for bag-of-words style feature extraction.

    Removes every character that is not a letter, digit or whitespace,
    lower-cases and strips the text, tokenizes it with the module-level
    ``wpt`` tokenizer, drops tokens found in the module-level
    ``stop_words`` collection and joins the remaining tokens with single
    spaces.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The cleaned document; empty if no tokens remain.
    """
    # The flag must be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so a positional re.I (== 2) would cap the number of
    # substitutions at two and apply no flag at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)
    
    
# Element-wise wrapper so the normalizer can be applied to the whole corpus array
normalize_corpus = np.vectorize(pyfunc=normalize_document)
norm_corpus = normalize_corpus(corpus)
norm_corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for the normalized corpus as a dense array
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns the same vocabulary terms as an ndarray.
vocab = cv.get_feature_names_out()
pd.DataFrame(cv_matrix, columns=vocab)

In [None]:
# Bigram-only counts (ngram_range=(2, 2)) for the normalized corpus, as a dense array
bv = CountVectorizer(ngram_range=(2, 2))
bv_matrix = bv.fit_transform(norm_corpus).toarray()
bv_matrix

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns the bigram vocabulary as an ndarray.
vocab = bv.get_feature_names_out()
pd.DataFrame(bv_matrix, columns=vocab)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for the normalized corpus as a dense array
tv = TfidfVectorizer()
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use the replacement.
# `vocab` is read again by the topic-printing cell further down.
vocab = tv.get_feature_names_out()
pd.DataFrame(tv_matrix, columns=vocab)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise document similarity on the TF-IDF vectors. The result gets its own
# name: rebinding `cosine_similarity` would shadow the imported function and
# make this cell fail when it is run a second time.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

In [None]:
from sklearn.cluster import KMeans
# Cluster the documents on their similarity profiles. random_state pins the
# centroid initialisation so the labels are the same on every run; n_init is
# spelled out because the library default changed between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=42)
# Only the fitted labels are needed, so fit() replaces the discarded fit_transform()
km.fit(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Two-topic LDA fitted on the TF-IDF matrix; dt_matrix has one row per
# document and one column per topic
lda_params = dict(n_components=2, max_iter=100, random_state=42)
lda = LatentDirichletAllocation(**lda_params)
dt_matrix = lda.fit_transform(tv_matrix)
topic_columns = ["T1", "T2"]
features = pd.DataFrame(dt_matrix, columns=topic_columns)
features

In [None]:
# For each topic, print the (token, weight) pairs whose weight exceeds 0.6,
# heaviest first, followed by a blank line
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    heavy_terms = [(token, weight)
                   for token, weight in zip(vocab, topic_weights)
                   if weight > 0.6]
    topic = sorted(heavy_terms, key=lambda pair: pair[1], reverse=True)
    print(topic)
    print()

In [None]:
from gensim.models import word2vec
# Tokenize every normalized document with the shared tokenizer
tokenized_corpus = list(map(wpt.tokenize, norm_corpus))

# Word2Vec hyper-parameters
feature_size = 10      # passed as vector_size
window_context = 10    # passed as window
sample = 1e-3          # passed as sample
min_word_count = 1     # passed as min_count

w2v_model = word2vec.Word2Vec(
    sentences=tokenized_corpus,
    vector_size=feature_size,
    window=window_context,
    min_count=min_word_count,
    sample=sample,
)
w2v_model.wv['sky']

In [None]:
import datetime
import numpy as np
import pandas as pd
from dateutil.parser import parser
import pytz

In [None]:
# Raw timestamp strings, each carrying its own UTC offset
time_stamps = [
    '2015-03-08 10:30:00.360000+00:00',
    '2017-07-13 15:45:05.755000-07:00',
    '2012-01-20 22:30:00.254000+05:30',
    '2016-12-25 00:30:00.000000+10:00',
]

df = pd.DataFrame({"Time": time_stamps})
df

In [None]:
# Parse each raw string into a pandas Timestamp and store the results as a new column
raw_times = df["Time"].to_numpy()
ts_objs = np.array([pd.Timestamp(raw) for raw in raw_times])
df["TS_objs"] = ts_objs
df

In [None]:
# Column name -> function extracting that feature from a single Timestamp.
# Dict order is the order in which the columns are added to the frame.
timestamp_features = {
    # date-based features
    "Year": lambda ts: ts.year,
    "Month": lambda ts: ts.month,
    "Day": lambda ts: ts.day,
    "DayOfWeek": lambda ts: ts.dayofweek,
    "DayName": lambda ts: ts.day_name().title(),
    "WeekOfYear": lambda ts: ts.weekofyear,
    "Quarter": lambda ts: ts.quarter,
    "DayOfYear": lambda ts: ts.dayofyear,
    # time-based features
    "Hour": lambda ts: ts.hour,
    "Minute": lambda ts: ts.minute,
    "Seconds": lambda ts: ts.second,
    "MUseconds": lambda ts: ts.microsecond,
    "UTCoffset": lambda ts: ts.utcoffset(),
}

for feature_name, extract in timestamp_features.items():
    df[feature_name] = df["TS_objs"].apply(extract)

df

In [None]:
# Right-closed hour intervals (-1,5], (5,11], (11,16], (16,21], (21,23]
# mapped to a named part of the day
hour_bins = [-1, 5, 11, 16, 21, 23]
day_bin = ['Late Night', 'Morning', 'Afternoon', 'Evening', 'Night']
time_of_day = pd.cut(x=df["Hour"], bins=hour_bins, labels=day_bin)
df["TimeOfDayBin"] = time_of_day
df

In [None]:
def _tz_names_matching_offset(d):
    """Return the distinct tz abbreviations of every pytz zone whose UTC
    offset at instant ``d`` equals ``d``'s own offset."""
    names = set()
    for tz in map(pytz.timezone, pytz.all_timezones_set):
        if d.astimezone(tz).utcoffset() == d.utcoffset():
            names.add(d.astimezone(tz).tzname())
    return list(names)


df["TZ_info"] = df["TS_objs"].apply(lambda x: x.tzinfo)
df['TimeZones'] = df['TS_objs'].apply(_tz_names_matching_offset)
df

In [None]:
# Derived representations of each timestamp: the UTC-converted value, the
# timestamp() result and the toordinal() result
ts_column = df['TS_objs']
df['TimeUTC'] = ts_column.apply(lambda d: d.tz_convert(pytz.utc))
df['Epoch'] = ts_column.apply(lambda d: d.timestamp())
df['GregOrdinal'] = ts_column.apply(lambda d: d.toordinal())

df

In [None]:
import datetime
# Days elapsed between each timestamp and the current UTC time, computed
# two ways. Each result gets its own column: writing both to
# 'DaysElapsedEpoch' would silently discard the epoch-based value.
curr_ts = datetime.datetime.now(pytz.utc)
# difference of the timestamp() values, divided by the seconds in a day
df['DaysElapsedEpoch'] = (curr_ts.timestamp() - df['Epoch']) / (3600*24)
# difference of the toordinal() values
df['DaysElapsedOrdinal'] = (curr_ts.toordinal() - df['GregOrdinal'])

df

In [None]:
import skimage
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io
%matplotlib inline

In [None]:
# Read the two sample images and build a one-column frame naming them
c = io.imread('minion.png')
d = io.imread('minion_2.png')
df = pd.DataFrame({'Image': ['c', 'd']})
df

In [None]:
# Report the array shape of each image
print(f"cat image shape:\n {c.shape}")
print(f"dog image shape:\n {d.shape}")


# Show both images side by side in a 1x2 grid
fig, (axs_1, axs_2) = plt.subplots(1, 2, figsize=(5, 8))
axs_1.imshow(c)
axs_2.imshow(d)

In [None]:
def _isolate_channel(img, keep):
    """Return a copy of ``img`` with channels 0, 1 and 2 zeroed except ``keep``."""
    isolated = img.copy()
    for channel in range(3):
        if channel != keep:
            isolated[:, :, channel] = 0
    return isolated


c_red = _isolate_channel(c, 0)    # channels 1 and 2 set to 0
c_green = _isolate_channel(c, 1)  # channels 0 and 2 set to 0
c_blue = _isolate_channel(c, 2)   # channels 0 and 1 set to 0

# Place the three single-channel images side by side
image = np.concatenate((c_red, c_green, c_blue), axis=1)
plt.figure(figsize=(10,8))
plt.imshow(image)

In [None]:
from skimage.color import rgb2gray

# Grayscale versions of both images
c_gs, d_gs = rgb2gray(c), rgb2gray(d)

# Inspect the first grayscale image: its shape, the 2-D pixel grid and the flattened vector
print("Image shape:\n", c_gs.shape)
c_pixels_rounded = np.round(c_gs, 2)
print("2D Image pixel:\n", c_pixels_rounded)
c_flat_rounded = np.round(c_gs.flatten(), 2)
print("flattened image:\n",c_flat_rounded)

In [None]:
# 2x2 grid: grayscale images on the top row, their 30-bin pixel-intensity
# histograms on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(8, 4))
(ax1, ax2), (ax3, ax4) = axes
ax1.imshow(c_gs, cmap="gray")
ax2.imshow(d_gs, cmap='gray')
c_freq, c_bins, c_patches = ax3.hist(c_gs.flatten(), bins=30)
d_freq, d_bins, d_patches = ax4.hist(d_gs.flatten(), bins=30)

In [None]:
# pd.Timestamp() cannot be called without an argument and has no `.m`
# attribute, so the original scratch expression raised on every run.
# NOTE(review): presumably a leftover from exploring Timestamp attributes;
# showing the current month keeps the cell runnable - delete it if unneeded.
pd.Timestamp.now().month

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
# 2x3 sample from a normal distribution with loc 1 and scale 2
x = np.random.normal(1, 2, (2, 3))
x

In [None]:
# 1000 draws from Binomial(n=10, p=0.5), shown as a histogram without a KDE curve.
# sns.distplot is deprecated; histplot is its histogram replacement.
y = np.random.binomial(n=10, p=0.5, size=(1000))
sns.histplot(y, kde=False)
y

In [None]:
# 1000 draws from Poisson(lam=2). sns.distplot is deprecated; its default
# output (density-normalised histogram plus KDE curve) is reproduced with
# histplot using kde=True and stat="density".
sns.histplot(np.random.poisson(lam=2, size=1000), kde=True, stat="density")