In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Dimensionality reduction

## Load data

In [None]:
# Name given to the label column of the wine data.
class_name = 'Cultivars'

# Names for the thirteen measurement columns; they are assigned positionally
# to the columns that follow the label in the header-less CSV.
feature_cols = [
    'Alcohol',
    'Malic_acid',
    'Ash',
    'Alcalinity_of_ash',
    'Magnesium',
    'Total_phenols',
    'Flavanoids',
    'Nonflavanoid_phenols',
    'Proanthocyanins',
    'Color_intensity',
    'Hue',
    'OD280/OD315_of_diluted_wines',
    'Proline',
]

# The label comes first, then the features.
cols = [class_name, *feature_cols]

df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=cols,
)

# Separate the label vector from the feature matrix.
target = df[class_name]
features = df.drop(columns=[class_name])

df.head()

## PCA

In [None]:
from sklearn.decomposition import PCA

# Fit PCA keeping every component so the variance explained by each one can
# be inspected.
# NOTE(review): the features are not standardized before fitting, so columns
# with large numeric ranges may dominate the components — confirm intended.
pca = PCA().fit(features)

# Cumulative explained-variance ratio. The x values start at 1 so that the
# point at x = k is the share of variance captured by the first k components
# (plotting against the default 0-based index shifts the curve by one).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
# The ratios are fractions in [0, 1], not percentages.
plt.ylabel('Cumulative Explained Variance (fraction)')
plt.title('Wine Dataset Explained Variance')
plt.show()

In [None]:
def zip_features_and_class(features_, class_):
    """Combine a 2-D embedding with its class labels into one DataFrame.

    Parameters
    ----------
    features_ : array-like supporting ``features_[:, i]``
        Embedded points; column 0 becomes ``x`` and column 1 becomes ``y``.
    class_ : pandas.Series or pandas.DataFrame
        Labels, one row per embedded point, in the same row order as
        ``features_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` followed by the column(s) of ``class_``.
    """
    df_ = pd.DataFrame({'x': features_[:, 0], 'y': features_[:, 1]})
    # The embedding carries no index, so rows correspond to labels by
    # position. pd.concat(axis=1) aligns on index instead; dropping the
    # labels' index keeps row i of the points paired with row i of the labels
    # even when the labels come from a filtered or shuffled frame.
    labels = class_.reset_index(drop=True)
    result = pd.concat([df_, labels], axis=1, sort=False)
    return result

def plot_reduced_dataset(df_, class_col=None):
    """Scatter-plot a 2-D embedding with one legend entry per class.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with ``x`` and ``y`` columns plus the label column.
    class_col : str, optional
        Name of the label column to group by. When omitted, the
        notebook-level ``class_name`` is used, so existing one-argument
        calls behave as before.
    """
    if class_col is None:
        # Looked up at call time so the notebook-level setting is honoured.
        class_col = class_name
    groups = df_.groupby(class_col)

    # Plot
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 7)
    ax.margins(0.05)
    for name, group in groups:
        # Marker size is numeric in matplotlib; pass a number, not the string '7'.
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=7, label=name)
    ax.legend(title=class_col)
    plt.show()

In [None]:
# Reduce the wine features to two PCA components and attach the class labels
# so the result can be plotted per class.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features)
res_pca = zip_features_and_class(reduced_features, target)
# Preview the first rows instead of echoing the whole frame.
res_pca.head()

In [None]:
# Scatter the two PCA coordinates, one marker group per class label.
plot_reduced_dataset(res_pca)

## TSNE

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic; seed it (same seed as the dataset shuffles below) so
# Restart & Run All reproduces the same embedding.
tsne = TSNE(n_components=2, random_state=42)
reduced_features = tsne.fit_transform(features)
res_tsne = zip_features_and_class(reduced_features, target)
# Preview the first rows instead of echoing the whole frame.
res_tsne.head()

In [None]:
# Scatter the two t-SNE coordinates, one marker group per class label.
plot_reduced_dataset(res_tsne)

# Text processing

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the complete training split once, only to list the available topics.
train_all = fetch_20newsgroups(subset='train')
# A bare expression in the middle of a cell is never displayed, so print it.
print(train_all.target_names)

# The rest of the analysis works on three of those topics.
categories = ['sci.crypt', 'sci.space', 'comp.windows.x']
data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [None]:
# Show the first three lines of the first training document ...
first_doc_head = data_train.data[0].split("\n")[:3]
print("\n".join(first_doc_head))
print()
# ... followed by the name of the topic it is labelled with.
first_doc_topic = data_train.target_names[data_train.target[0]]
print(first_doc_topic)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the training documents into a document-term count matrix, lower-casing
# the text and dropping the vectorizer's built-in English stop words.
# `count_vect` is kept because the test documents are transformed with it later.
count_vect = CountVectorizer(lowercase=True, stop_words='english')
X_train_counts = count_vect.fit_transform(data_train.data)
# Display the matrix dimensions as the cell output.
X_train_counts.shape

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Convert the raw term counts into TF-IDF weights. The fitted transformer is
# kept so the test documents can be weighted with it later.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the matrix dimensions as the cell output.
X_train_tfidf.shape

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def report(actual, predicted):
    """Print accuracy, confusion matrix and classification report, in that order.

    Parameters
    ----------
    actual
        Ground-truth labels.
    predicted
        Labels produced by the model, passed to each metric after ``actual``.
    """
    # Each entry pairs an output template with the metric that fills it; the
    # metric is evaluated right before its line is printed.
    sections = (
        ('Accuracy: {0}\n', accuracy_score),
        ('Confusion matrix:\n\n {0}\n', confusion_matrix),
        ('Classification report:\n\n {0}', classification_report),
    )
    for template, metric in sections:
        print(template.format(metric(actual, predicted)))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomized; seed the model so the fitted trees and the
# scores reported below are the same on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_tfidf, data_train.target)

In [None]:
# Fetch the test split for the same categories as the training data.
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# Apply the vectorizer and TF-IDF transformer that were fitted on the
# training documents (transform only, no refitting).
test_counts = count_vect.transform(twenty_test.data)
docs_test = tfidf_transformer.transform(test_counts)
print(docs_test.shape)
print()

# Score the classifier on the test documents.
predicted = rfc.predict(docs_test)
report(twenty_test.target, predicted)

In [None]:
import multidict as multidict

import os
import re
from PIL import Image
from os import path
from wordcloud import WordCloud


def getFrequencyDictForText(texts):
    """Count case-insensitive word frequencies over a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Documents to count words in.

    Returns
    -------
    multidict.MultiDict
        Maps each lower-cased word to its number of occurrences, skipping a
        small fixed set of English stop words.
    """
    # Whole-word stop list. The previous unanchored regex match discarded
    # every token that merely *started* with one of these (e.g. "apple",
    # "internet", "before") and ignored capitalised stop words.
    stop_words = {
        "a", "the", "an", "to", "in", "for", "of", "or", "by",
        "with", "is", "on", "that", "be",
    }

    counts = {}
    for t in texts:
        # split() with no argument splits on any whitespace run, so newlines
        # and tabs separate words and no empty tokens are produced.
        for raw in t.split():
            word = raw.lower()
            if word in stop_words:
                continue
            # Read and write under the same lower-cased key so occurrences
            # with different capitalisation accumulate.
            counts[word] = counts.get(word, 0) + 1

    fullTermsDict = multidict.MultiDict()
    for key, value in counts.items():
        fullTermsDict.add(key, value)
    return fullTermsDict
        

def makeImage(text):
    """Draw and show a word cloud built from word frequencies.

    Parameters
    ----------
    text
        Despite the name, this is handed to
        ``WordCloud.generate_from_frequencies``, so it is the word-to-frequency
        mapping rather than raw text.
    """
    # Build the cloud on a white background, capped at 1000 words.
    cloud = WordCloud(background_color="white", max_words=1000).generate_from_frequencies(text)

    # Render it on a square figure with the axes hidden.
    plt.figure(figsize=(15, 15))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
# Count word frequencies over all training documents and draw them as a word cloud.
makeImage(getFrequencyDictForText(data_train.data))

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix onto two SVD components for plotting.
# n_components=2 is the library default, spelled out because the plots below
# read exactly two columns. The default solver is randomized, so it is seeded
# to make the projection reproducible.
svd = TruncatedSVD(n_components=2, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
# Preview the first rows instead of echoing the whole array.
X_train_svd[:5]

In [None]:
# Translate the integer targets into their topic names.
target_named = [data_train.target_names[idx] for idx in data_train.target]

# Pair each 2-D SVD point with its topic name.
topic_labels = pd.DataFrame({'topic': target_named})
text_df = zip_features_and_class(X_train_svd, topic_labels)

groups = text_df.groupby('topic')

# One marker series per topic on a 15 x 7 inch figure.
fig, ax = plt.subplots(figsize=(15, 7))
ax.margins(0.05)
for topic, points in groups:
    ax.plot(points.x, points.y, marker='o', linestyle='', ms='7', label=topic)
ax.legend()
plt.show()

In [None]:
from sklearn.cluster import KMeans
# Three clusters, matching the three selected topics. K-means starts from
# random centroids, so it is seeded to make the cluster assignment reproducible.
kmeans = KMeans(n_clusters=3, random_state=42)

kmeans.fit(X_train_svd)
# Cluster index assigned to every training document.
y_kmeans = kmeans.predict(X_train_svd)

In [None]:
# Documents in the 2-D SVD space, coloured by their k-means cluster.
plt.figure(figsize=(15,7))
plt.scatter(X_train_svd[:, 0], X_train_svd[:, 1], c=y_kmeans, s=50, cmap='viridis')

# Overlay the fitted cluster centres as large translucent black markers.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5, label='Cluster centers')

plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.title('K-means clusters of the training documents')
plt.legend()
# Explicit show() so the cell does not end on a bare scatter call and echo
# the PathCollection repr.
plt.show()


In [None]:
# Re-draw the class-coloured scatter of the wine data with the plotting helper.
# NOTE(review): this cell used to inline the helper's body and read `df_`,
# which only exists as a parameter inside the helper functions, so it raised
# NameError on a fresh kernel. `res_pca` is assumed to be the intended frame
# (it carries the `class_name` column) — swap in `res_tsne` if that was meant.
plot_reduced_dataset(res_pca)