# 엔그램 

### 데이터 준비

In [8]:
# train set과 test set이 분리되어 있음
from sklearn.datasets import fetch_20newsgroups

# 원하는 토픽만 선택하기
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

newsgroups_test = fetch_20newsgroups(subset='test', 
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

In [9]:
print('train set size:', len(newsgroups_train.data))
print('test set size:', len(newsgroups_test.data))
print('selected categories:', newsgroups_train.target_names)
print('train labels:', set(newsgroups_train.target))

train set size: 2034
test set size: 1353
selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
train labels: {0, 1, 2, 3}


In [10]:
print('##Train set text samples:', newsgroups_train.data[0])
print('##Train set label smaples:', newsgroups_train.target[0])
print('##Test set text samples:', newsgroups_test.data[0])
print('##Test set label smaples:', newsgroups_test.target[0])

##Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
##Train set label smaples: 1
##Test set text samples: TRry the SKywatch project in  Arizona.
##Test set label smaples: 2


### text document classification을 수행하기

In [11]:
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\82104\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", 
                        decode_error ='ignore', 
                        lowercase=True, 
                        stop_words = stopwords.words('english'), 
                        max_df=0.5,
                        min_df=2).fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 11483)


In [19]:
from sklearn.linear_model import LogisticRegression 
#분류기 선언
clf = LogisticRegression() 

# train data를 이용하여 분류기를 학습
clf.fit(X_train_tfidf, y_train)

# train data에 대한 예측정확도 
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) 

# test data에 대한 예측정확도
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) 

Train set score: 0.966
Test set score: 0.761


### Bigram

In [20]:
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", 
                        decode_error ='ignore', 
                        lowercase=True, 
                        stop_words = stopwords.words('english'),
                        ngram_range=(1, 2),
                        max_df=0.5,
                        min_df=2).fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 26550)


In [21]:
bigram_features = [f for f in tfidf.get_feature_names() if len(f.split()) > 1]
print(bigram_features[:10])

["'cause can't", "'em better", "'expected errors'", "'karla' next", "'nodis' password", "'official doctrine", "'ok see", "'sci astro'", "'what's moonbase", 'aas american']


In [22]:
#분류기 선언
clf = LogisticRegression() 

# train data를 이용하여 분류기를 학습
clf.fit(X_train_tfidf, y_train)

# train data에 대한 예측정확도 
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train)))

# test data에 대한 예측정확도
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) 

Train set score: 0.969
Test set score: 0.756


### Trigram

In [23]:
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", 
                        decode_error ='ignore', 
                        lowercase=True, 
                        stop_words = stopwords.words('english'),
                        ngram_range=(1, 3),
                        max_df=0.5,
                        min_df=2).fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 32943)


In [24]:
trigram_features = [f for f in tfidf.get_feature_names() if len(f.split()) > 2]
print(trigram_features[:10])



In [26]:
#분류기 선언
clf = LogisticRegression() 

# train data를 이용하여 분류기를 학습
clf.fit(X_train_tfidf, y_train) 

# train data에 대한 예측정확도 
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) 

# test data에 대한 예측정확도
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) 

Train set score: 0.969
Test set score: 0.758


### Ridge

In [27]:
from sklearn.linear_model import RidgeClassifier
ridge_clf = RidgeClassifier() #릿지 분류기 선언
ridge_clf.fit(X_train_tfidf, y_train) #학습
print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score: 0.976
Test set score: 0.775


### Lasso

In [28]:
import numpy as np
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear')
# train data로 학습
lasso_clf.fit(X_train_tfidf, y_train) 
print('Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))
print('Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1]) 

Train set score: 0.761
Test set score: 0.695
Used features count: 246 out of 32943


### SVM

In [29]:
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(X_train_tfidf, y_train) 

# train data에 대한 예측정확도
print('Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) 
# test data에 대한 예측정확도
print('Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test))) 

Train set score: 0.974
Test set score: 0.758
